국립국어원 API 2020. 06. 24

package kr.co.shineware.komoran.tutorials;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Looks up a list of Korean keywords in the dictionary search API at
 * {@code stdict.korean.go.kr} and writes one CSV file per keyword.
 *
 * <p>Each raw keyword is first stripped of every character that is not a
 * Hangul syllable or whitespace, then sent as the {@code q} query parameter.
 * Every {@code item} element of the XML response becomes one CSV row with the
 * columns {@code word,pos,definition,type}.
 */
public class ShinTest2 {

    /**
     * Search endpoint including the fixed query parameters; the URL-encoded
     * keyword is appended after {@code q=}.
     *
     * <p>NOTE(review): the API key is embedded in source; consider loading it
     * from configuration or the environment instead.
     */
    private static final String SEARCH_URL =
            "https://stdict.korean.go.kr/api/search.do?certkey_no=1572"
            + "&key=14F296AB6ADF1D06A38A8F1B1BF75207"
            + "&method=include&type_search=search&q=";

    /** Output file path up to (not including) the 1-based keyword index. */
    private static final String OUTPUT_PREFIX = "/home/ehdrud1129/다운로드/20200624_국립국어원_결과";

    /** Output file path after the 1-based keyword index. */
    private static final String OUTPUT_SUFFIX = "_노동경.csv";

    /**
     * Cleans the hard-coded keyword list, queries the API once per keyword and
     * writes each result to its own CSV file.
     *
     * <p>A failure for one keyword (network, XML parsing, file I/O) is printed
     * to standard error and does not stop the remaining keywords.
     *
     * @param args unused
     * @throws Exception declared for compatibility; per-keyword failures are
     *         caught and printed inside the loop
     */
    public static void main(String[] args) throws Exception {

        // Comma-separated raw input, before preprocessing.
        String input = "“당근”, ‘양파sdfgs’, 오4535이’s, 2345수dfgsv박!@#%@$^, /참$%^외^&*/";

        // Split on commas, then clean each piece into modifdStr.
        String[] str = input.split(",");
        String[] modifdStr = new String[str.length];
        for (int i = 0; i < str.length; i++) {
            // Drop non-Hangul characters, then the surrounding whitespace.
            modifdStr[i] = StringReplace(str[i]).trim();
        }

        // The factory does not depend on the keyword, so create it once.
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();

        for (int j = 0; j < modifdStr.length; j++) {
            try {
                // Percent-encode the keyword so Hangul and spaces are valid in the query string.
                String url = SEARCH_URL
                        + URLEncoder.encode(modifdStr[j], StandardCharsets.UTF_8.name());

                DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
                // Fetch the URL and parse the response into a DOM document.
                Document doc = dBuilder.parse(url);
                doc.getDocumentElement().normalize();

                String result = buildCsv(doc);

                // try-with-resources closes the stream even when the write fails and
                // never dereferences a stream that could not be opened.
                String path = OUTPUT_PREFIX + (j + 1) + OUTPUT_SUFFIX;
                try (BufferedOutputStream bs =
                        new BufferedOutputStream(new FileOutputStream(path))) {
                    // The stream accepts bytes only; encode explicitly as UTF-8.
                    bs.write(result.getBytes(StandardCharsets.UTF_8));
                }
            } catch (Exception e) {
                // Best effort: report the failure and continue with the next keyword.
                e.printStackTrace();
            }
        }
    }

    /**
     * Builds the CSV text for one parsed API response.
     *
     * @param doc parsed response document
     * @return a header line followed by one line per {@code item} element
     */
    private static String buildCsv(Document doc) {
        StringBuilder csv = new StringBuilder("word,pos,definition,type");
        NodeList items = doc.getElementsByTagName("item");

        for (int k = 0; k < items.getLength(); k++) {
            Node node = items.item(k);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element item = (Element) node;
            csv.append("\n")
                    .append(getTagValue("word", item).trim()).append(",")
                    .append(getTagValue("pos", item).trim()).append(",")
                    // The definition is reduced to Hangul, '.' and whitespace,
                    // which also removes any commas it contained.
                    .append(StringReplace2(getTagValue("definition", item).trim())).append(",")
                    .append(getTagValue("type", item).trim());
        }
        return csv.toString();
    }

    /**
     * Removes every character that is neither a Hangul syllable
     * (U+AC00..U+D7A3) nor whitespace.
     *
     * @param str text to clean
     * @return the text with all other characters removed
     */
    public static String StringReplace(String str) {
        String match = "[^\uAC00-\uD7A3\\s]";
        return str.replaceAll(match, "");
    }

    /**
     * Removes every character that is not a Hangul syllable
     * (U+AC00..U+D7A3), a period, or whitespace.
     *
     * @param str text to clean
     * @return the text with all other characters removed
     */
    public static String StringReplace2(String str) {
        String match = "[^\uAC00-\uD7A3.\\s]";
        return str.replaceAll(match, "");
    }

    /**
     * Returns the value of the first child node of the first descendant
     * element named {@code tag}.
     *
     * @param tag element name to look up
     * @param eElement element to search within
     * @return the node value, or an empty string when the tag is absent, has
     *         no children, or its first child carries no value
     */
    private static String getTagValue(String tag, Element eElement) {
        Node tagNode = eElement.getElementsByTagName(tag).item(0);
        if (tagNode == null) {
            return "";
        }
        Node nValue = tagNode.getFirstChild();
        if (nValue == null) {
            return "";
        }
        String value = nValue.getNodeValue();
        return value == null ? "" : value;
    }
}

저작자표시 비영리 변경금지 (새창열림)

'Programming > Java' 카테고리의 다른 글

HttpClient 또오해영 (0)	2020.06.29
우리말샘 API 2020. 06 .24 (0)	2020.06.29
신문기사 자연어 처리 2020. 06 .24 (0)	2020.06.29
JAVA 환경설정 (0)	2020.03.13
Stack과 Heap (0)	2020.03.13

지식과 경험을 공유 해보자.

국립국어원 API 2020. 06. 24

'Programming > Java' 카테고리의 다른 글

티스토리툴바

국립국어원 API 2020. 06. 24

'Programming > Java' 카테고리의 다른 글

'Programming/Java' Related Articles

티스토리툴바