펭귄컴퓨팅/프로그래밍

Python + Playwright로 웹스크래핑(WebScraping) 예제 (Async 방식)

훅크선장 2023. 8. 29. 16:15

import os
import asyncio
from playwright.async_api import async_playwright
from datetime import datetime

# 현재 실행 디렉토리를 가져온다.
current_dir = os.getcwd()
# 디렉토리의 맨 마지막에 / 구분자의 존재 여부를 확인하여, 항상 / 로 끝나도록 만든다.
if current_dir[-1] != '/':
current_dir = current_dir + '/'
# 현재 실행 디렉토리를 출력한다.
#print(current_dir)

# 비동기 실행 메인 함수 
async def main():
    # 현재 날짜와 시간을 얻어오고, 년월일 시분초 형식으로 구성한다.
    current_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    #print("Current date & time : ", current_datetime)
    # 년월일 시분초 형식으로 문자열을 생성한다.
    str_current_datetime = str(current_datetime)
    
    # 비동기 playwright 객체를 가지고 실행
    async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True) # 브라우저 생성
            page = await browser.new_page() # 브라우저의 페이지 생성

            await page.goto("https://hook.tistory.com/", timeout=10000) # ※ 접속불가시 timeout 에러 발생!! 에러처리 필요

            # 접속된 페이지의 스크린샷을 뜨고, 지정된 파일이름으로 현재 실행디렉토리 밑에 생성한다.
            await page.screenshot(path=current_dir + 'capture/'+ f'example-chromium-{str_current_datetime}.png')
            print(await page.title())

            #print(await page.locator(
            #       "//div[@class='entry']/div[@class='titleWrap']/div[@class='info']/span[@class='date']"
            #       ).all_text_contents())
            #print(await page.locator(
            #       "//div[@class='entry']/div[@class='titleWrap']/h2/a"
            #       ).all_text_contents())
            #print(await page.locator(
            #       "//div[@class='entry']/div[@class='titleWrap']/h2/a"
            #       ).get_attribute('href').all())
            for a_loc in await page.locator("//div[@class='entry']/div[@class='titleWrap']").all():
                print(await a_loc.locator("//h2/a").text_content())
                print(await a_loc.locator("//div[@class='info']/span[@class='date']").text_content())
                print(await a_loc.locator("//h2/a").get_attribute('href'))
                print("----------------------------------------------------------------------------------------")

            await browser.close() # 브라우저를 종료

# 비동기 실행 
asyncio.run(main())