Scraping
Python has a library called requests that makes communicating with the web convenient.
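A minimal sketch of what a request looks like (example.com is a placeholder URL):

import requests

response = requests.get("https://example.com")
print(response.status_code)  # 200 if the request succeeded
print(response.text[:100])   # first 100 characters of the HTML body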
Web scraping
- TAG selector
- ID selector
- Class selector
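A small sketch of the three selector forms with select(); the HTML snippet and its tag/id/class names are made up for illustration:

from bs4 import BeautifulSoup

html = """
<div id="price" class="box">
  <span class="value">10.58</span>
  <span class="value">11.02</span>
</div>
"""
soup = BeautifulSoup(html, "html5lib")

soup.select("span")    # TAG selector: every <span> element
soup.select("#price")  # ID selector: the element with id="price"
soup.select(".value")  # Class selector: every element with class="value"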
When the element has an id
import requests

url_hynix = "https://finance.naver.com/item/main.nhn?code=000660"
response = requests.get(url_hynix)
html = response.text

# conda install bs4 html5lib
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html5lib")
tags = soup.select("#_per")  # list of tags matching the id selector
tags[0]                      # the first matching tag
tags[0].text                 # '10.58' (the PER as a string)
url_finance = "https://finance.naver.com/item/main.nhn?code="

def get_per(code):
    url = url_finance + code
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html5lib")
    tags = soup.select("#_per")
    tag = tags[0]
    return float(tag.text)

print(get_per("000660"))  # PER of SK hynix (code 000660)
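get_per raises an IndexError when the selector matches nothing (for example, if the page layout changes). A defensive variant, sketched here rather than taken from the notes:

def get_per_safe(code):
    # Hypothetical variant: returns None instead of raising when #_per is missing.
    html = requests.get(url_finance + code).text
    soup = BeautifulSoup(html, "html5lib")
    tags = soup.select("#_per")
    if not tags:
        return None
    return float(tags[0].text)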

When the element has no id
import requests
from bs4 import BeautifulSoup

url_hynix = "https://finance.naver.com/item/main.nhn?code=000660"
response = requests.get(url_hynix)
html = response.text
soup = BeautifulSoup(html, "html5lib")
tags = soup.select("#tab_con1 > div:nth-child(3) > table > tbody > tr.strong > td > em")
tag = tags[0]
print(tag.text)
for tag in tags:
print(tag.text)
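Long nth-child paths like the one above break as soon as the page layout shifts. A looser selector that keeps only the distinctive parts of the path is often more robust; a sketch, reusing the id and class names from the selector above:

# Descendant selector keyed on id and class only, skipping the positional steps.
tags = soup.select("#tab_con1 tr.strong em")
for tag in tags:
    print(tag.text)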

Web API
RESTful API
https://api.korbit.co.kr/v1/ticker/detailed?currency_pair=btc_krw
https://api.korbit.co.kr/v1/ticker/detailed?currency_pair=xrp_krw
JSON -> dictionary
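response.json() is shorthand for parsing the body with the standard json module; a minimal sketch of the JSON-to-dictionary mapping (the values are made up):

import json

text = '{"timestamp": 1647224400000, "last": "55000000"}'
data = json.loads(text)  # JSON string -> Python dictionary
print(data["last"])      # '55000000'
print(type(data))        # <class 'dict'>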

import requests

url = "https://api.korbit.co.kr/v1/ticker/detailed?currency_pair=btc_krw"
response = requests.get(url)
# result = response.text  # string
result = response.json()  # dictionary
import datetime as dt

daytime = dt.datetime.fromtimestamp(result['timestamp'] / 1000)  # ms -> seconds
daytime.year
daytime.second
daytime.microsecond
daytime.weekday()  # 0: Monday, 1: Tuesday, ...
daytime.strftime("%B %m월 %d일 %A")  # 'March 03월 14일 Monday'
x = dt.datetime.strptime("2017-01-02 14:44", "%Y-%m-%d %H:%M")
x.hour    # 14
x.minute  # 44
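Putting the API response and datetime together: a sketch that prints the price with a readable timestamp. It assumes the ticker payload also carries a 'last' field, which is not shown in the notes above:

# Hypothetical: 'last' is assumed to be the latest trade price in the payload.
when = dt.datetime.fromtimestamp(result["timestamp"] / 1000)
print(f"BTC/KRW last traded at {result['last']} ({when:%Y-%m-%d %H:%M:%S})")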
When the data is only visible in a browser
This happens because the page requests its data with Ajax (Asynchronous JavaScript and XML).
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
url = 'https://finance.naver.com/item/sise_day.nhn?code=066570'
# Send the request as if it came from a browser
# (the server does not respond unless the client looks like a browser).
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'}
res = requests.get(url, headers=headers)
html = bs(res.text, 'lxml')
html_table = html.select("table")
tables = pd.read_html(str(html_table))
tables[0].dropna()  # the daily price table, spacer rows removed
tables[1]           # the second table (page-navigation links)
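A short usage sketch of the result: the cleaned price table can be inspected and saved (the output file name is hypothetical):

df = tables[0].dropna()                     # daily prices without the spacer rows
print(df.head())                            # peek at the first rows
df.to_csv("066570_daily.csv", index=False)  # hypothetical output file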