티스토리 뷰

반응형

https://youtu.be/fE4_G1IMpYw

다른 강의 자료는 www.codingnow.co.kr/ 를 참고해 주세요!

 

코딩나우

프로그래밍 교육 및 개발 의뢰 받습니다.

www.codingnow.co.kr

파이썬을 활용하여 파일을 읽어오고

파일 내용에서 필요한 것을 추출합니다.

여기서는 비트코인의 종목을 추출하게 됩니다.

웹페이지(https://www.bithumb.com/trade/order/BTC_KRW)에서 특정 정보를

1. 파일로 저장하고 저장된 파일의 내용 중에서 종목 명을 가져옵니다.

2. 또한 BeautifulSoup을 사용하여 웹 크롤링으로 데이터를 가져옵니다.

자세한 설명은 동영상을 참고하시고

소스코드는 첨부된 zip파일 또는 git server를 참고하세요.

[소스코드]

project.zip
0.04MB

1. 파일의 내용 추출하기

import pprint

def read_file(file_name):
    """Print every line of a UTF-8 text file to standard output.

    Each line is handed to ``print`` unchanged, so a line that still carries
    its trailing newline is followed by an extra blank line in the output.

    Args:
        file_name: Path of the text file to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if printing fails part-way.
    with open(file_name, 'r', encoding='UTF8') as f:
        for line in f:
            print(line)

# file_name = "./data/coinname_220503.html"
# read_file(file_name)

def read_file_and_parser(file_name):
    """Extract coin symbols and names for the KRW market from a saved HTML file.

    The file is scanned line by line. After a line containing
    ``data-market="KRW"`` has been seen, the next element whose markup contains
    ``tx_l tx_link`` is accumulated (it may span several lines) until its
    closing ``</span>``. From that fragment the ``data-sorting`` attribute
    value becomes the coin name and the span text, with ``/KRW`` removed,
    becomes the coin symbol. A new ``data-market="KRW"`` line is required
    before each further entry is collected.

    Newlines, spaces and double quotes are stripped from the fragment before
    parsing, so names made of several words are stored without spaces.
    Fragments without a ``data-sorting`` attribute, or without both a name and
    a symbol, are skipped instead of producing a bogus entry or an IndexError.

    Intermediate parsing steps and the final mapping are printed.

    Args:
        file_name: Path of the UTF-8 encoded HTML file.

    Returns:
        dict: ``{symbol: name}`` with keys in sorted order.

    Raises:
        OSError: If the file cannot be opened.
    """
    strip_tokens = ['\n', ' ', '"', '/KRW']
    name_marker = 'data-sorting='
    span_end = '</span>'

    findLine = None     # text accumulated for the entry currently being read
    find_krw = False    # True once a KRW-market item has been seen
    coin_list = {}

    # The context manager guarantees the file is closed even if parsing fails.
    with open(file_name, 'r', encoding='UTF8') as f:
        for line in f:
            if not find_krw:
                if 'data-market="KRW"' in line:
                    find_krw = True
                else:
                    continue
            if 'tx_l tx_link' in line:
                findLine = ''

            if findLine is None:
                continue
            findLine += line
            if span_end not in findLine:
                continue

            print('1:', findLine)
            for val in strip_tokens:
                findLine = findLine.replace(val, '')
            print('2:', findLine)

            marker_pos = findLine.find(name_marker)
            end = findLine.find(span_end)
            fields = []
            if marker_pos >= 0:
                start = marker_pos + len(name_marker)
                # Expected shape after stripping: "<name>><symbol>".
                fields = findLine[start:end].split('>')
            if len(fields) >= 2:
                coin_list[fields[1]] = fields[0]
                print('3:', fields)
                print('----------------------------')

            # Wait for the next KRW-market item before collecting again.
            findLine = None
            find_krw = False

    coin_list = dict(sorted(coin_list.items()))
    pprint.pprint(coin_list, width=1)
    return coin_list

def write_to_file(coin_list, result_file_name):
    """Write the coin mapping to a file as a Python ``coinlist`` dict literal.

    The output has one ``'key':'value',`` entry per line between
    ``coinlist = {`` and ``}``. Keys and values are converted to ``str`` and
    written with ``repr`` so that quotes or backslashes inside them are escaped
    and the generated file stays valid Python.

    Args:
        coin_list: Mapping of coin symbol to coin name.
        result_file_name: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The context manager closes (and flushes) the file even on error.
    with open(result_file_name, 'w', encoding='UTF8') as f:
        f.write('coinlist = {\n')
        for key, data in coin_list.items():
            f.write('{!r}:{!r},\n'.format(str(key), str(data)))
        f.write('}\n')


# Input: the saved HTML page; output: the generated Python module.
file_name = "./data/coinname_220503.html"
result_file_name = './result/coinlist_file.py'

# Parse the saved page first, then persist the extracted mapping.
coin_list = read_file_and_parser(file_name)
write_to_file(coin_list, result_file_name)

2. BeautifulSoup을 사용하여 크롤링하기

from bs4 import BeautifulSoup as bs4
import requests
import pprint
#pip install requests
#pip install bs4

def get_coinlist_from_web(URL, timeout=10):
    """Download a page and extract the KRW-market coin symbols and names.

    The page is fetched with a browser-like ``User-Agent`` header and parsed
    with BeautifulSoup. Every ``<li>`` with ``class="fvtWrap"``,
    ``data-market="KRW"`` and ``data-fixed="N"`` contributes one entry: the
    text of its first ``<span>`` with ``/KRW`` removed is the symbol, and that
    span's ``data-sorting`` attribute is the name. Items without a span or
    without the attribute are skipped. The resulting mapping is printed.

    Args:
        URL: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        dict: ``{symbol: name}`` with keys in sorted order.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
    rq = requests.get(URL, headers=header, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page into an
    # empty coin list.
    rq.raise_for_status()
    soup = bs4(rq.content, 'html.parser')

    li_list = soup.find_all(
        'li',
        {'class': 'fvtWrap', 'data-market': 'KRW', 'data-fixed': 'N'},
    )
    coin_list = {}
    for li in li_list:
        se = li.select('span')
        if not se:
            continue
        first_span = se[0]
        name = first_span.attrs.get('data-sorting')
        if name is None:
            continue
        coin = first_span.text.replace('/KRW', '')
        coin_list[coin] = name

    coin_list = dict(sorted(coin_list.items()))
    pprint.pprint(coin_list, width=1)
    return coin_list

def write_to_file(coin_list, result_file_name):
    """Write the coin mapping to a file as a Python ``coinlist`` dict literal.

    The output has one ``'key':'value',`` entry per line between
    ``coinlist = {`` and ``}``. Keys and values are converted to ``str`` and
    written with ``repr`` so that quotes or backslashes inside them are escaped
    and the generated file stays valid Python.

    Args:
        coin_list: Mapping of coin symbol to coin name.
        result_file_name: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The context manager closes (and flushes) the file even on error.
    with open(result_file_name, 'w', encoding='UTF8') as f:
        f.write('coinlist = {\n')
        for key, data in coin_list.items():
            f.write('{!r}:{!r},\n'.format(str(key), str(data)))
        f.write('}\n')

# Source page to crawl and the Python module that receives the result.
url = 'https://www.bithumb.com/trade/order/BTC_KRW'
result_file_name = './result/coinlist_web.py'

# Fetch and parse the live page first, then persist the extracted mapping.
coin_list = get_coinlist_from_web(url)
write_to_file(coin_list, result_file_name)
반응형