우리말샘 API 2020. 06 .24

package kr.co.shineware.komoran.tutorials;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Looks up a fixed list of Korean words in the opendict.korean.go.kr search API and writes
 * one CSV file per word.
 *
 * <p>Each raw term is first stripped of everything except Hangul syllables and whitespace.
 * The XML answer for a term is flattened into rows of {@code word,pos,definition,type}
 * (one row per non-{@code word} child element of every {@code item}).
 */
public class ShinTest3 {

    /** Matches every character that is neither a Hangul syllable (U+AC00..U+D7A3) nor whitespace. */
    private static final Pattern NON_HANGUL = Pattern.compile("[^\uAC00-\uD7A3\\s]");

    /** Matches every character that is not a Hangul syllable, {@code ~}, {@code .}, a digit or whitespace. */
    private static final Pattern NON_DEFINITION_CHAR = Pattern.compile("[^\uAC00-\uD7A3~.0-9\\s]");

    // NOTE(review): the API key is hard-coded in the request URL; consider moving it to configuration.
    /** Request URL up to and including {@code q=}; the encoded search term is appended to it. */
    private static final String SEARCH_URL_PREFIX =
            "https://opendict.korean.go.kr/api/search?certkey_no=1575&key"
                    + "=412ADB218892DA0D337ECE3DC3AA2C98&target_type=search&part"
                    + "=word&q=";

    /** Remaining request parameters that follow the search term. */
    private static final String SEARCH_URL_SUFFIX = "&sort=dict&start=1&num=10";

    /** First line of every generated CSV file. */
    private static final String CSV_HEADER = "word,pos,definition,type";

    /** Output path in front of the 1-based term number. */
    private static final String OUTPUT_PATH_PREFIX = "/home/ehdrud1129/다운로드/20200624_우리말샘_결과";

    /** Output path behind the 1-based term number. */
    private static final String OUTPUT_PATH_SUFFIX = "_노동경.csv";

    /**
     * Cleans the hard-coded, comma-separated input, prints the cleaned terms, then queries the
     * API once per term and writes the flattened answer to a numbered CSV file.
     *
     * <p>A failure for one term (network, parsing or file error) is reported on stderr and does
     * not stop the remaining terms from being processed.
     *
     * @param args unused
     * @throws Exception if the XML parser factory cannot be created
     */
    public static void main(String[] args) throws Exception {

        // Raw, comma-separated input before preprocessing.
        String input = "“당근”, ‘양파sdfgs’, 오4535이’s, 2345수dfgsv박!@#%@$^, /참$%^외^&*/";

        String[] str = input.split(",");
        String[] modifdStr = new String[str.length];
        for (int i = 0; i < str.length; i++) {
            // Drop non-Hangul characters first, then the surrounding blanks left by the split.
            modifdStr[i] = StringReplace(str[i]).trim();
        }

        for (String term : modifdStr) {
            System.out.println(term);
        }
        System.out.println();

        // The factory is loop-invariant; only the builder is created per request.
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();

        for (int j = 0; j < modifdStr.length; j++) {
            String term = modifdStr[j];
            if (term.isEmpty()) {
                // Nothing left to search for after cleaning.
                System.err.println("Skipping term #" + (j + 1) + ": empty after cleaning");
                continue;
            }
            try {
                DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
                // parse(String) fetches and parses the document behind the given URI.
                Document doc = dBuilder.parse(buildSearchUrl(term));
                doc.getDocumentElement().normalize();

                String csv = buildCsv(doc.getDocumentElement());
                writeCsv(OUTPUT_PATH_PREFIX + (j + 1) + OUTPUT_PATH_SUFFIX, csv);
            } catch (Exception e) {
                // Best effort per term: report and continue with the next one.
                System.err.println("Failed to process term #" + (j + 1) + ": " + term);
                e.printStackTrace();
            }
        }
    }

    /** Builds the request URL for one search term, percent-encoding the term as UTF-8. */
    private static String buildSearchUrl(String term) throws IOException {
        return SEARCH_URL_PREFIX
                + URLEncoder.encode(term, StandardCharsets.UTF_8.name())
                + SEARCH_URL_SUFFIX;
    }

    /** Flattens every {@code item} below {@code rootElement} into CSV text, header line first. */
    private static String buildCsv(Element rootElement) {
        StringBuilder csv = new StringBuilder(CSV_HEADER);
        NodeList itemList = rootElement.getElementsByTagName("item");

        // Row prefix ("\n" + word text); set by the most recent <word> element seen.
        String word = "";
        for (int k = 0; k < itemList.getLength(); k++) {
            NodeList childNodes = itemList.item(k).getChildNodes();
            for (int l = 0; l < childNodes.getLength(); l++) {
                Node info = childNodes.item(l);
                if (info.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element element = (Element) info;
                if ("word".equals(element.getTagName())) {
                    word = "\n" + element.getTextContent();
                } else {
                    // Every non-word child is handled as a sense entry
                    // (presumably only <sense> occurs here; verify against the API schema).
                    csv.append(word);
                    appendSenseColumns(csv, element);
                }
            }
        }
        return csv.toString();
    }

    /** Appends the {@code ,pos,definition,type} columns of one sense element to {@code csv}. */
    private static void appendSenseColumns(StringBuilder csv, Element senseParent) {
        StringBuilder pos = new StringBuilder(",");
        StringBuilder definition = new StringBuilder(",");
        StringBuilder type = new StringBuilder(",");

        NodeList senseList = senseParent.getChildNodes();
        for (int m = 0; m < senseList.getLength(); m++) {
            Node sense = senseList.item(m);
            if (sense.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element senseElement = (Element) sense;
            String tagName = senseElement.getTagName();
            if ("pos".equals(tagName)) {
                pos.append(senseElement.getTextContent());
            } else if ("definition".equals(tagName)) {
                // Cleaning also removes commas, so the definition cannot break the CSV columns.
                definition.append(StringReplace2(senseElement.getTextContent()));
            } else if ("type".equals(tagName)) {
                type.append(senseElement.getTextContent());
            }
        }
        csv.append(pos).append(definition).append(type);
    }

    /** Writes {@code csv} to {@code path} as UTF-8; the stream is closed even when writing fails. */
    private static void writeCsv(String path, String csv) throws IOException {
        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(path))) {
            out.write(csv.getBytes(StandardCharsets.UTF_8));
        }
    }

    /**
     * Removes every character that is not a Hangul syllable (U+AC00..U+D7A3) or whitespace.
     *
     * @param str text to clean; must not be {@code null}
     * @return the text with all other characters deleted
     */
    public static String StringReplace(String str) {
        return NON_HANGUL.matcher(str).replaceAll("");
    }

    /**
     * Removes every character that is not a Hangul syllable, {@code ~}, {@code .}, an ASCII
     * digit or whitespace.
     *
     * @param str text to clean; must not be {@code null}
     * @return the text with all other characters deleted
     */
    public static String StringReplace2(String str) {
        return NON_DEFINITION_CHAR.matcher(str).replaceAll("");
    }

    /**
     * Returns the node value of the first child of the first {@code tag} descendant of
     * {@code eElement}.
     *
     * @param tag tag name to look up
     * @param eElement element whose descendants are searched
     * @return the node value, or {@code null} when the tag is absent or has no children
     */
    private static String getTagValue(String tag, Element eElement) {
        Node first = eElement.getElementsByTagName(tag).item(0);
        if (first == null) {
            return null;
        }
        Node nValue = first.getChildNodes().item(0);
        if (nValue == null) {
            return null;
        }
        return nValue.getNodeValue();
    }

    /**
     * Same lookup as {@link #getTagValue(String, Element)}; kept under its original name.
     *
     * @param tag tag name to look up
     * @param eElement element whose descendants are searched
     * @return the node value, or {@code null} when the tag is absent or has no children
     */
    private static String getTagValue2(String tag, Element eElement) {
        return getTagValue(tag, eElement);
    }
}
저작자표시 비영리 변경금지 (새창열림)
'Programming > Java' 카테고리의 다른 글

국립국어원_HTTP적용 2020. 06 .24 (0)	2020.06.29
HttpClient 또오해영 (0)	2020.06.29
국립국어원 API 2020. 06 .24 (0)	2020.06.29
신문기사 자연어 처리 2020. 06 .24 (0)	2020.06.29
JAVA 환경설정 (0)	2020.03.13
지식과 경험을 공유 해보자.

우리말샘 API 2020. 06 .24

'Programming > Java' 카테고리의 다른 글

티스토리툴바

우리말샘 API 2020. 06 .24

'Programming > Java' 카테고리의 다른 글

'Programming/Java' Related Articles

티스토리툴바