
Crawling (9)
SNS crawling

from selenium import webdriver as wb
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import time

url = 'https://www.instagram.com/'
driver = wb.Chrome()
driver.get(url)
# log in by hand
input_search = driver.find_element_by_css_selector('input.XTCLo')
input_search.send_keys('찾는키워드')  # the keyword to search for
soup = bs(driver.page_source, 'lxml')
title_list = soup.select('span._28KuJ + div spa..
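
The preview above is cut off, and the find_element_by_* helpers it uses were removed in Selenium 4. Below is a minimal sketch of the same flow on Selenium 4+; the class names XTCLo and _28KuJ are taken from the snippet and were specific to Instagram's markup at the time, while the fixed time.sleep waits and the printed output are my own assumptions.

# Minimal sketch for Selenium 4+; selectors will likely need updating against the live site.
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/')
time.sleep(30)                       # log in by hand while the script waits (assumed wait)

search_box = driver.find_element(By.CSS_SELECTOR, 'input.XTCLo')  # class name from the preview
search_box.send_keys('keyword to search')
time.sleep(2)                        # let the suggestion list render

soup = BeautifulSoup(driver.page_source, 'lxml')
titles = soup.select('span._28KuJ')  # class name from the preview; exact child selector is assumed
for t in titles:
    print(t.text)

driver.quit()
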
Image crawling

from bs4 import BeautifulSoup as bs
from selenium import webdriver as wb
import time
from urllib.request import urlretrieve  # saves an image URL to a local file
from selenium.webdriver.common.keys import Keys

url = 'https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EA%BC%AC%EB%A0%9B'
driver = wb.Chrome()
driver.get(url)
body = driver.find_element_by_tag_name('body')
for num in range(50):
    body.send_..
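
A minimal sketch of the same scroll-and-download idea on Selenium 4+. The search URL and the PAGE_DOWN loop follow the preview; the img._image selector, the sleep lengths, and the output file names are assumptions about Naver's image-search markup.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import time

url = 'https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EA%BC%AC%EB%A0%9B'
driver = webdriver.Chrome()
driver.get(url)

body = driver.find_element(By.TAG_NAME, 'body')
for _ in range(50):                  # scroll down to trigger lazy loading
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)

soup = BeautifulSoup(driver.page_source, 'lxml')
imgs = soup.select('img._image')     # assumed selector for result thumbnails
for i, img in enumerate(imgs):
    src = img.get('src')
    if src and src.startswith('http'):
        urlretrieve(src, f'image_{i}.jpg')   # save each thumbnail to disk

driver.quit()
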
Fetching all cafe menu items

# Starbucks home page -> click MENU
from selenium import webdriver as wb
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import time
import pandas as pd

url = 'https://www.istarbucks.co.kr/index.do'
driver = wb.Chrome()
driver.get(url)
driver.find_element_by_class_name('gnb_nav02').click()  # click "view details"
driver.find_element_by_class_name('menu_drink_btn01').click()  # click "view nutrition info"
..
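
A minimal sketch of the same click-then-parse pattern on Selenium 4+. The gnb_nav02 and menu_drink_btn01 class names come from the preview; the dl dd a selector for item names and the CSV file name are assumptions and may need adjusting against the live page.

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://www.istarbucks.co.kr/index.do')
time.sleep(2)

driver.find_element(By.CLASS_NAME, 'gnb_nav02').click()         # open the MENU tab (class from the preview)
time.sleep(2)
driver.find_element(By.CLASS_NAME, 'menu_drink_btn01').click()  # open the drink list (class from the preview)
time.sleep(2)

soup = BeautifulSoup(driver.page_source, 'lxml')
names = [tag.text.strip() for tag in soup.select('dl dd a')]    # assumed selector for menu item names
pd.DataFrame({'menu': names}).to_csv('starbucks_menu.csv', encoding='utf-8-sig')

driver.quit()
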
Crawling cafe branch names, addresses, and phone numbers

from selenium import webdriver as wb
from bs4 import BeautifulSoup as bs
import time
import pandas as pd

url = 'http://www.istarbucks.co.kr/store/store_map.do'
driver = wb.Chrome()
driver.get(url)
# click the "search by region" button
btn_search = driver.find_element_by_class_name('loca_search')
#btn_search = driver.find_element_by_xpath('//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/header[2]/h3/..
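
A minimal sketch of the store-locator flow on Selenium 4+. Only the loca_search class appears in the preview; set_sido_cd_btn, set_gugun_cd_btn, and li.quickResultLstCon are assumptions about the store-map markup, so verify them in the browser's developer tools before relying on them.

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('http://www.istarbucks.co.kr/store/store_map.do')
time.sleep(2)

driver.find_element(By.CLASS_NAME, 'loca_search').click()       # "search by region" (class from the preview)
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'set_sido_cd_btn').click()   # first region button (assumed class)
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'set_gugun_cd_btn').click()  # "all districts" button (assumed class)
time.sleep(3)

soup = BeautifulSoup(driver.page_source, 'lxml')
stores = soup.select('li.quickResultLstCon')                    # one <li> per store (assumed)
rows = []
for s in stores:
    details = s.select_one('p.result_details')                  # assumed: address text followed by phone
    parts = details.get_text('|', strip=True).split('|') if details else []
    rows.append({'name': s.get('data-name'),
                 'address': parts[0] if parts else None,
                 'phone': parts[-1] if len(parts) > 1 else None})
pd.DataFrame(rows).to_csv('starbucks_stores.csv', encoding='utf-8-sig')

driver.quit()
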
Lunchbox menu crawling

from selenium import webdriver as wb
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import time
import pandas as pd

url = 'https://www.hsd.co.kr/menu/menu_list'
driver = wb.Chrome()
driver.get(url)
# exception handling (try/except)
# click the "load more" button element repeatedly
btn_more = driver.find_element_by_class_name('c_05')
try:
    for index in range(50):
        btn_more.click()
        time.sleep(2)  # pause for 2 seconds
    e..
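
A minimal sketch of the "keep clicking load more until it disappears" pattern on Selenium 4+, which is what the try/except in the preview is building toward. The c_05 class comes from the preview; the selectors for menu names and prices are assumptions about the Hansot menu page.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://www.hsd.co.kr/menu/menu_list')
time.sleep(2)

try:
    btn_more = driver.find_element(By.CLASS_NAME, 'c_05')   # "load more" button (class from the preview)
    for _ in range(50):
        btn_more.click()
        time.sleep(2)                                        # let the new items render
except WebDriverException:
    pass                                                     # button gone or unclickable: everything is loaded

soup = BeautifulSoup(driver.page_source, 'lxml')
names = [t.text.strip() for t in soup.select('h4.h.fz_03')]              # assumed selector for item names
prices = [t.text.strip() for t in soup.select('div.item-price strong')]  # assumed selector for prices
pd.DataFrame(list(zip(names, prices)), columns=['menu', 'price']).to_csv('hsd_menu.csv', encoding='utf-8-sig')

driver.quit()
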
Collecting a month of movie ratings

import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd

movie_date = []
movie_title = []
movie_rate = []
for day in range(20191201, 20191226, 1):
    url = "https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cur&tg=0&date=" + str(day)
    res = req.get(url)
    soup = bs(res.content, 'lxml')
    title_list = soup.select('div.tit5 > a')
    rate_list = soup.find_all('td', class_='point')
    for ind..
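
The integer loop range(20191201, 20191226) only happens to work because every value is a valid date inside a single month; pandas.date_range is a safer way to build the date strings. Below is a minimal sketch along those lines; the URL and selectors are kept from the preview and the ranking page may no longer be served.

import requests
from bs4 import BeautifulSoup
import pandas as pd

rows = []
for day in pd.date_range('2019-12-01', '2019-12-25').strftime('%Y%m%d'):
    url = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cur&tg=0&date=' + day
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    titles = soup.select('div.tit5 > a')               # selector from the preview
    rates = soup.find_all('td', class_='point')        # selector from the preview
    for title, rate in zip(titles, rates):
        rows.append({'date': day, 'title': title.text.strip(), 'rate': rate.text.strip()})

pd.DataFrame(rows).to_csv('movie_ratings_dec2019.csv', encoding='utf-8-sig')
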
Collecting titles and ratings from the movie ranking page

import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd

url = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cur&date=20191228'
res = req.get(url)
# parser options: lxml, html.parser, html5lib
soup = bs(res.content, 'lxml')
name = soup.select('div.tit5 > a')
rate = soup.find_all('td', class_='point')
len(name), len(rate)
# collect rank, movie title, and rating
rank_list = []
name_list = []
rating_l..
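
The preview stops right where the rank/title/rating lists are being built. Here is one self-contained way to finish that step (a sketch, not the post's exact code); the selectors come from the preview, and the page may no longer be served.

import requests
from bs4 import BeautifulSoup
import pandas as pd

res = requests.get('https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cur&date=20191228')
soup = BeautifulSoup(res.content, 'lxml')

names = soup.select('div.tit5 > a')             # movie title links
rates = soup.find_all('td', class_='point')     # rating cells

rank_list, name_list, rating_list = [], [], []
for rank, (n, r) in enumerate(zip(names, rates), start=1):
    rank_list.append(rank)
    name_list.append(n.text.strip())
    rating_list.append(r.text.strip())

movie = pd.DataFrame({'rank': rank_list, 'title': name_list, 'rating': rating_list})
print(movie.head())
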
Three encoding options

# Use one of the three encodings below when saving the CSV
music.to_csv('music50.csv', encoding='euc-kr')
music.to_csv('music50_utf8.csv', encoding='utf-8-sig')
music.to_csv('music50_utf8.csv', encoding='')
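
For reference: 'euc-kr' is the legacy Korean Windows encoding, while 'utf-8-sig' writes a UTF-8 byte-order mark so Excel recognizes the file as UTF-8. The third call's encoding is left blank in the snippet; 'cp949' is another common choice on Korean Windows, but that is only a guess. A small sketch of reading the files back with matching encodings, assuming the file names above:

import pandas as pd

# index_col=0 assumes to_csv wrote its default integer index
music_euckr = pd.read_csv('music50.csv', encoding='euc-kr', index_col=0)
music_utf8 = pd.read_csv('music50_utf8.csv', encoding='utf-8-sig', index_col=0)
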