본문 바로가기
E | ngineering

Selenium으로 Crawl

by 덞웖이 2024. 10. 4.

환경

- Ubuntu Server 22.04
- VSCode Insiders
- Jupyter Extension
- Remote SSH Extension
- Python 3.10 venv

사전 세팅

# A few extra steps are needed because Chrome has to run headless on a server
# Install Chrome (apt resolves the .deb's dependencies, which plain `dpkg -i` does not)
sudo apt update
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt install -y ./google-chrome-stable_current_amd64.deb
rm google-chrome-stable_current_amd64.deb

# Install the driver
google-chrome --version # check the version, then substitute it into the URL below
wget https://storage.googleapis.com/chrome-for-testing-public/<버전>/linux64/chromedriver-linux64.zip
# The archive and the directory it extracts to are both named with a hyphen
unzip chromedriver-linux64.zip
sudo mv chromedriver-linux64/chromedriver /usr/local/bin/
sudo chmod +x /usr/local/bin/chromedriver
rm -r chromedriver-linux64 && rm chromedriver-linux64.zip

# In Jupyter prefix the command with %; in a terminal, activate the target kernel (venv, conda, ...) first
pip install selenium webdriver-manager -q

패키지

# driver loading
from webdriver_manager.chrome import ChromeDriverManager

# browser execution
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# for interfacing
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains

# render-waits
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# exceptions for loops
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Chrome options for running without a display (headless)
chrm_options = Options()
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrm_options.add_argument(_flag)

실행

  • 요소만 가져오기
# Print the text of every list entry, walking the entries by 1-based position
# until the next position no longer exists in the DOM.
with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrm_options) as driver:
    driver.get('https://indistreet.com/live?sortOption=startDate%3AASC')

    idx = 0
    while True:
        idx += 1
        xpath = f'//*[@id="__next"]/div/main/div[2]/div/div[4]/div[1]/div[{idx}]/div/a/div[2]/p[1]'

        if idx == 1:
            # Only the first entry is waited for explicitly (max 5 sec);
            # 'driver.implicitly_wait(5)' is an alternative for simpler use cases.
            row = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, xpath)))
        else:
            # find_element raises instead of returning None, so the end of the
            # list is detected through NoSuchElementException.
            try:
                row = driver.find_element(By.XPATH, xpath)
            except NoSuchElementException:
                print("✋End of list🚫")
                break

        print(row.text)
  • 입력 인터페이스 사용
# Drive the Naver login form with deliberately invalid credentials and print
# the error message the page shows.
#
# The driver is used as a context manager so the browser is always quit, even
# when driver.get() itself fails. Explicit waits replace fixed sleeps so the
# script neither races the page render nor waits longer than necessary.
with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrm_options) as driver:
    wait = WebDriverWait(driver, 5)
    driver.get('https://www.naver.com/')  # (the hashcode site did not work, so Naver is used)

    # Login button on the landing page
    xpath = '//*[@id="account"]/div/a'

    try:
        # Wait until clickable; this condition already requires the element to
        # be displayed and enabled, so no separate is_displayed()/is_enabled()
        # check is needed afterwards.
        button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
    except TimeoutException:
        print("Was not clickable within the period")
    else:
        # A single action does not need an ActionChains
        button.click()

        try:
            # Wait for the login form to render, then collect its controls
            id_input = wait.until(EC.presence_of_element_located((By.ID, 'id')))
            pw_input = driver.find_element(By.ID, 'pw')
            login_btn = driver.find_element(By.ID, 'log.login')

            # Submit a wrong / non-existent account
            (
                ActionChains(driver)
                .send_keys_to_element(id_input, 'test')
                .send_keys_to_element(pw_input, 'test')
                .click(login_btn)
                .perform()
            )

            # Print the error shown by the page once it becomes visible
            error_xpath = '//*[@id="err_common"]/div'
            error = wait.until(EC.visibility_of_element_located((By.XPATH, error_xpath)))
            print(error.text)
        except (NoSuchElementException, TimeoutException):
            print("Element not found")

네이버(영문 버전) 로그인 에러 메시지가 정상적으로 출력되는 것을 확인했다.

 

'E | ngineering' 카테고리의 다른 글

Seaborn 활용 2  (0) 2024.10.04
Seaborn 활용 1  (0) 2024.10.04
BS4만으로 Crawl  (0) 2024.10.03
자료구조/알고리즘  (0) 2024.10.01
test3  (1) 2024.01.18