
Image Crawling

from bs4 import BeautifulSoup as bs
from selenium import webdriver as wb
import time
from urllib.request import urlretrieve  # saves an image URL as a local file
from selenium.webdriver.common.keys import Keys

url = 'https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EA%BC%AC%EB%A0%9B'

driver = wb.Chrome()
driver.get(url)

# send PAGE_DOWN to the <body> element so lazy-loaded thumbnails get rendered
body = driver.find_element_by_tag_name('body')

for num in range(50):
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
    
soup = bs(driver.page_source, 'lxml')

# Naver image search thumbnails carry the class "_img"
img_src = soup.select('img._img')
time.sleep(2)

# collect the src attribute of every thumbnail tag
img_src2 = []

for i in img_src:
    img_src1 = i['src']
    img_src2.append(img_src1)

file_no = 0

for i in range(len(img_src2)):

    try:
        # save as ./img/0.jpg, ./img/1.jpg, ... (the img folder must already exist)
        urlretrieve(img_src2[i], './img/' + str(file_no) + '.jpg')
    except:
        continue

    file_no += 1
    time.sleep(2)

    print('Saving image no. %s.' % file_no)
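
The bare except in the download loop silently skips every image if the ./img folder does not exist yet. A minimal sketch, assuming the same relative path as above, that creates the folder before the loop runs:

import os

# create the target folder up front so urlretrieve does not fail on a missing path
os.makedirs('./img', exist_ok=True)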

 

How to scroll down

# scrolling happens on the <body> tag, so grab the body element and send the keys to it

body = driver.find_element_by_tag_name('body')

for num in range(10):
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
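
An alternative not used in this post is to scroll with JavaScript instead of sending keys; a rough sketch, assuming the same driver object as above:

# scroll to the bottom of the page via JavaScript instead of PAGE_DOWN keys
for _ in range(10):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(0.3)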

 

from bs4 import BeautifulSoup as bs
from selenium import webdriver as wb
from selenium.webdriver.common.keys import Keys
import time
from urllib.request import urlretrieve

url = 'https://www.google.com/search?q=%ED%8C%8C%EC%9D%B4%EB%A6%AC&source=lnms&tbm=isch&sa=X&ved=2ahUKEwib-qLPrt_nAhXQQN4KHavMDjwQ_AUoAXoECBkQAw&biw=1920&bih=969'

driver = wb.Chrome()
driver.get(url)

body = driver.find_element_by_tag_name('body')
# the "Show more results" button on Google image search (its class name may change)
btn_more = driver.find_element_by_class_name('mye4qd')

for num in range(100):
    try:
        # click "Show more results" when it can be clicked
        btn_more.click()
        time.sleep(0.5)
    except:
        # otherwise just keep scrolling down
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)
        
soup = bs(driver.page_source, 'lxml')
    
img = soup.select('img.rg_i')

# spot check: eagerly loaded thumbnails expose "src"; lazy-loaded ones only have "data-src"
img[2]['src']


img_src = []

for i in img:
    try:
        img_temp = i['src']           # eagerly loaded thumbnail
        img_src.append(img_temp)
    except:
        try:
            img_temp = i['data-src']  # lazy-loaded thumbnail
            img_src.append(img_temp)
        except:
            continue
            
file_no = 0

for j in range(len(img_src)):

    try:
        # each backslash in the Windows path is escaped as \\
        urlretrieve(img_src[j], 'C:\\Users\\SM009\\Desktop\\img/' + str(file_no) + '.jpg')
    except:
        continue

    file_no += 1
    time.sleep(2)

    print("Saving image no. %s." % file_no)
