Showing 3 changed files with 188 additions and 17 deletions.
@@ -0,0 +1,48 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import datetime\n",
    "# assumes dc_crawling.py is in the same folder\n",
    "from dc_crawling import *\n",
    "from kafka import KafkaProducer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bootstrap_servers = ['localhost:9092', 'localhost:9093']  # Kafka broker addresses\n",
    "topicName = 'DC'\n",
    "producer = KafkaProducer(bootstrap_servers=bootstrap_servers)\n",
    "\n",
    "for t in ACODE.keys():\n",
    "    dcs = DC_crawling(t, 100)\n",
    "    for i in dcs:\n",
    "        print(i)\n",
    "        producer.send(topicName, str(i).encode())\n",
    "\n",
    "    # mark the end of this ticker's batch with a timestamp\n",
    "    # (datetime.date has no now(); datetime.datetime.now() is the correct call)\n",
    "    tim = datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
    "    producer.send(topicName, tim.encode())\n",
    "\n",
    "    # time.sleep(1)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
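For reference, a minimal consumer sketch for the messages this notebook produces, assuming the same kafka-python package and the brokers and 'DC' topic shown above; the group_id and the decoding loop are illustrative, not part of the commit:

from kafka import KafkaConsumer

# Hypothetical consumer for the 'DC' topic; group_id is an assumed name.
consumer = KafkaConsumer(
    'DC',
    bootstrap_servers=['localhost:9092', 'localhost:9093'],
    auto_offset_reset='earliest',  # read from the oldest retained messages
    group_id='dc-demo',
)

for message in consumer:
    # each value is UTF-8 bytes: either str(post_dict) or a batch timestamp
    print(message.value.decode())

Sending json.dumps(i).encode() instead of str(i).encode() in the producer would make each message parseable with json.loads on the consumer side; the str() form used in the notebook is only human-readable.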
@@ -0,0 +1,116 @@
import requests
from bs4 import BeautifulSoup
# from pymongo import MongoClient


ACODE = {"호텔신라": "008770", "두산퓨얼셀": "336260", "한화솔루션": "009830",
         "에코프로": "086520", "신성이엔지": "011930", "유니슨": "018000",
         "카카오": "035720", "네이버": "035420", "엔씨소프트": "036570",
         "sds": "018260", "삼성전자": "005930", "SK하이닉스": "000660",
         "skc": "011790", "한솔케미칼": "014680", "DB하이텍": "000990",
         "롯데쇼핑": "023530", "신세계": "004170", "삼성물산": "028260",
         "동서": "026960", "kb금융": "105560", "한국금융지주": "071050", "신한지주": "055550",
         "우리금융지주": "316140", "하나금융지주": "086790"}


HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
# client = MongoClient('localhost', 27017)
# MongoDB = client['DC']
# collection = MongoDB['DC_title_crawl']

# date at which to stop crawling
# DATE_STOP = '2021-09-20'
DATE_STOP = '2021-08-19'


def title_crawl(ls, ticker, result):
    """Crawl post rows from each list page in ls; return -1 once DATE_STOP is reached."""
    # the last URL in ls seeds the next batch, so it is skipped here
    for i in range(0, len(ls) - 1):
        response = requests.get('https://gall.dcinside.com' + str(ls[i]), headers=HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        contents = soup.find('tbody').find_all('tr')

        for j in contents:
            # skip posts written by the board operator
            if j.find('td', {'class': 'gall_writer ub-writer'}).text == '운영자':
                continue

            # post date
            date_dict = j.find('td', {'class': 'gall_date'}).attrs
            if date_dict['title'][:10] <= DATE_STOP:
                return -1

            # title
            title = j.find('a').text
            # recommendation count
            recommend = j.find('td', class_='gall_recommend').text
            # view count
            views = j.find('td', class_='gall_count').text

            put_data = {
                'code': ACODE[ticker],
                'date': date_dict['title'],
                'title': title,
                'view': views,
                'recommend': recommend
            }

            result.append(put_data)
            # insert into MongoDB
            # MongoDB.DC_title_crawl.insert_one(put_data)


def DC_crawling(ticker, page):
    """Search the neostock board for ticker and crawl up to page batches of list pages."""
    RESULT = []
    a = f'/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}'
    response = requests.get('https://gall.dcinside.com' + a, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find('div', {'class': "bottom_paging_box"})

    # build the list of page URLs
    url_list = [a]
    for i in items.find_all('a'):
        url_list.append(i['href'])

    # first page batch
    title_crawl(url_list, ticker, RESULT)

    k = 0
    while k < page:
        response = requests.get('https://gall.dcinside.com' + str(url_list[-1]), headers=HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        items = soup.find('div', {'class': "bottom_paging_box"})
        url_list = [str(url_list[-1])]

        for i in items.find_all('a'):
            url_list.append(i['href'])
        # drop the first pagination anchor
        url_list.pop(1)

        if title_crawl(url_list, ticker, RESULT) == -1:
            break
        k += 1

    return RESULT


if __name__ == "__main__":
    for t in ACODE.keys():
        DC_crawling(t, 100)  # 100 pages
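As a usage sketch for the crawler module (evidently dc_crawling.py, given the notebook's import), the list of dicts returned by DC_crawling can be tabulated with pandas; the page count and output filename below are illustrative assumptions:

import pandas as pd
from dc_crawling import DC_crawling, ACODE

frames = []
for ticker in ACODE:
    rows = DC_crawling(ticker, 1)  # one pagination batch per ticker for a quick test
    frames.append(pd.DataFrame(rows))

df = pd.concat(frames, ignore_index=True)
df.to_csv('dc_titles.csv', index=False)  # hypothetical output path
print(df.head())

Since DC_crawling returns plain dicts with code/date/title/view/recommend keys, the same rows could equally be sent to Kafka or MongoDB as in the rest of the commit.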