신문기사 자연어 처리 2020. 06. 24

package kr.co.shineware.komoran.tutorials;

import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL;
import kr.co.shineware.nlp.komoran.core.Komoran;
import kr.co.shineware.nlp.komoran.model.KomoranResult;
import kr.co.shineware.nlp.komoran.model.Token;

import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.regex.Pattern;

public class ShinTest {
    public static void main(String[] args) throws Exception {

        Komoran komoran = new Komoran(DEFAULT_MODEL.STABLE);
        // 형태소 분석 시 사용될 기분석 사전을 로드, 형태소 분석 진행 전에 로드
        komoran.setFWDic("user_data/fwd.user");
        // 형태소 분석 시 사용될 사용자 사전을 로드, 형태소 분석 진행 전에 로드
        komoran.setUserDic("user_data/dic.user");


        for (int k = 1; k < 6; k++) {
        // 기사 5개(텍스트 파일) 불러오기
            String input = "";
            try {
                String filePath = "/home/ehdrud1129/다운로드/20200624_자연어처리_기사"
                					+ k + "_노동경";
                // 읽어오는 파일 경로
                FileInputStream fileStream = null;
                // 파일 스트림
                fileStream = new FileInputStream(filePath);
                // 파일 스트림 생성

                byte[] readBuffer = new byte[fileStream.available()];
                //버퍼 선언
                while (fileStream.read(readBuffer) != -1) {
                }
                input = new String(readBuffer);

                fileStream.close();
                //스트림 닫기
            } catch (Exception e) {
                e.getStackTrace();
            }

            
            input = StringReplace(input);
            // 특수문자 제거
            input = input.replaceAll(System.getProperty("line.separator"), "");
            // 개행문자 제거


            KomoranResult analyzeResultList = komoran.analyze(input);
            // String을 komoran메소드로 분석하여 담을 객체 생성
            List<Token> tokenList = analyzeResultList.getTokenList();
            // Token 형태의 리스트 객체 생성

            String tokenized = "morph,pos,beginIndex~endIndex";
            // 출력 결과를 담을 tokenized변수 생성

            for (Token token : tokenList) {
            // Token 리스트를 순서대로 분석
                if (!token.getMorph().equals("")) {
                // 공백은 제외하는 조건
				// Komoran 메소드를 활용한 요소 분석
                    tokenized = tokenized + "\n" + (token.getMorph() + "," + token.getPos()
                    		+ "," + token.getBeginIndex() + "~" + token.getEndIndex());
                }
            }


            BufferedOutputStream bs = null;
            try {
                bs = new BufferedOutputStream(new FileOutputStream("/home/ehdrud1129/다운로드/
                		20200624_자연어처리_결과" + k + "_노동경.csv"));
                bs.write(tokenized.getBytes());
                // Byte형으로 분석 결과 변수 tokenized를 text파일로 생성
            } catch (Exception e) {
                e.getStackTrace();
                // TODO: handle exception
            } finally {
                bs.close();
            }
        }
    }

    public static String StringReplace(String str) {
    // 한글, 숫자, 소문자, 대문자만 (^)남기고 나머지를 지우는 메소드
        String match = "[^\uAC00-\uD7A30-9a-zA-Z\\s]";
        str = str.replaceAll(match, "");
        return str;
    }
}

저작자표시 비영리 변경금지

'Programming > Java' 카테고리의 다른 글

우리말샘 API 2020. 06 .24 (0)	2020.06.29
국립국어원 API 2020. 06 .24 (0)	2020.06.29
JAVA 환경설정 (0)	2020.03.13
Stack과 Heap (0)	2020.03.13
JVM의 메모리모델 (0)	2020.03.13

지식과 경험을 공유 해보자.

신문기사 자연어 처리 2020. 06. 24

'Programming > Java' 카테고리의 다른 글

티스토리툴바

신문기사 자연어 처리 2020. 06. 24

'Programming > Java' 카테고리의 다른 글

'Programming/Java' Related Articles

티스토리툴바