Crawling with kafka producer #4
jayleenym committed Nov 23, 2021
1 parent c8b6a66 commit ef92ca8
Showing 3 changed files with 188 additions and 17 deletions.
48 changes: 48 additions & 0 deletions Kafka/DC_test_1122.ipynb
@@ -0,0 +1,48 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import random\n",
"import datetime\n",
"# dc_crawling.py 같은 폴더에 있다고 가정\n",
"from dc_crawling import *\n",
"from kafka import KafkaProducer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bootstrap_servers = ['localhost:9092','localhost:9093'] # kafka broker ip\n",
"topicName = 'DC'\n",
"producer = KafkaProducer(bootstrap_servers = bootstrap_servers)\n",
"\n",
"for t in ACODE.keys():\n",
" dcs = DC_crawling(t, 100)\n",
" for i in dcs:\n",
" print(i)\n",
" producer.send(topicName, str(i).encode())\n",
"\n",
" tim = datetime.date.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
" producer.send(topicName, tim.encode())\n",
"\n",
" # time.sleep(1)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
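Note: the notebook above publishes each crawled row as str(dict) bytes, followed by one per-ticker timestamp string. A minimal consumer sketch for checking the 'DC' topic, assuming the same kafka-python library and the two local brokers used in the notebook (this is not part of the commit; ast.literal_eval recovers the dict rows, and plain strings fall through as timestamp markers):

import ast
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'DC',                                                    # topic written by the notebook
    bootstrap_servers=['localhost:9092', 'localhost:9093'],
    auto_offset_reset='earliest',                            # start from the oldest message
)

for msg in consumer:
    text = msg.value.decode()
    try:
        record = ast.literal_eval(text)   # crawled rows were sent as str(dict)
    except (ValueError, SyntaxError):
        record = text                     # timestamp markers are plain strings
    print(record)

Sending json.dumps(i).encode() (or setting a value_serializer on the producer) would let the consumer use a plain json.loads instead; the str(dict) form above just mirrors what the notebook sends.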
116 changes: 116 additions & 0 deletions Kafka/dc_crawling.py
@@ -0,0 +1,116 @@
import requests
from urllib import request
from bs4 import BeautifulSoup
import time
import pandas as pd
# from pymongo import MongoClient


ACODE = {"호텔신라":"008770","두산퓨얼셀":"336260","한화솔루션":"009830",
"에코프로":"086520","신성이엔지":"011930","유니슨":"018000",
"카카오":"035720","네이버":"035420","엔씨소프트":"036570",
"sds":"018260","삼성전자":"005930","SK하이닉스":"000660",
"skc":"011790","한솔케미칼":"014680","DB하이텍":"000990",
"롯데쇼핑":"023530","신세계":"004170","삼성물산":"028260",
"동서":"026960","kb금융":"105560","한국금융지주":"071050","신한지주":"055550",
"우리금융지주":"316140","하나금융지주":"086790"}


HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
# RESULT = []
# client = MongoClient('localhost', 27017)
# MongoDB = client['DC']
# collection = MongoDB['DC_title_crawl']

# stop date (crawl back to this date, exclusive)
# DATE_STOP = '2021-09-20'
DATE_STOP = '2021-08-19'

def title_crawl(ls, ticker, result):
    global DATE_STOP

    for i in range(0, len(ls)-1):
        response = requests.get('https://gall.dcinside.com' + str(ls[i]), headers=HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        items = soup.find_all("table", {'class': 'gall_list'})
        contents = soup.find('tbody').find_all('tr')

        for j in contents:
            # skip rows written by the gallery admin ('운영자')
            if j.find('td', {'class': 'gall_writer ub-writer'}).text == '운영자':
                pass
            else:
                # date
                date_dict = j.find('td', {'class': 'gall_date'}).attrs

                if date_dict['title'][:10] <= DATE_STOP:
                    # reached the stop date: signal the caller to stop paging
                    return -1
                else:
                    # Date.append(date_dict['title'])
                    # title
                    title = j.find('a').text
                    # recommendation count
                    recommend_tag = j.find('td', class_='gall_recommend')
                    recommend = recommend_tag.text
                    # Rec.append(recommend)

                    # view count
                    views_tag = j.find('td', class_='gall_count')
                    views = views_tag.text
                    # View.append(views)

                    put_data = {
                        'code': ACODE[ticker],
                        'date': date_dict['title'],
                        'title': title,
                        'view': views,
                        'recommend': recommend
                    }

                    result.append(put_data)
                    # code that inserts into MongoDB
                    # MongoDB.DC_title_crawl.insert_one(put_data)


# change the search keyword per ticker
def DC_crawling(ticker, page):
    RESULT = []
    url = f"https://gall.dcinside.com/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}"

    a = f'/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}'
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find('div', {'class': "bottom_paging_box"})

    # build the list of paging URLs
    url_list = [a]
    for i in items.find_all('a'):
        url_url = i['href']
        url_list.append(url_url)

    # first page
    title_crawl(url_list, ticker, RESULT)

    k = 0
    while k < page:
        response = requests.get('https://gall.dcinside.com' + str(url_list[-1]), headers=HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        items = soup.find('div', {'class': "bottom_paging_box"})
        url_list = [str(url_list[-1])]

        for i in items.find_all('a'):
            url_url = i['href']
            url_list.append(url_url)
        url_list.pop(1)

        if title_crawl(url_list, ticker, RESULT) == -1:
            break
        else:
            k += 1

    return RESULT



if __name__ == "__main__":
    for t in ACODE.keys():
        DC_crawling(t, 100)  # 100 pages per ticker
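For a quick single-ticker smoke test of the module before wiring in the producer, a sketch like the following could be used (illustrative only and not part of the commit; the printed fields depend on the live gallery):

from dc_crawling import DC_crawling

rows = DC_crawling("카카오", 1)   # one page of search results
print(len(rows), "rows")
if rows:
    # each row is a dict shaped like:
    # {'code': '035720', 'date': '2021-11-22 13:05:01',
    #  'title': '...', 'view': '123', 'recommend': '0'}
    print(rows[0])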
41 changes: 24 additions & 17 deletions crawling/dc갤러리/dc_crawling.py
@@ -3,7 +3,7 @@
 from bs4 import BeautifulSoup
 import time
 import pandas as pd
-from pymongo import MongoClient
+# from pymongo import MongoClient
 
 
 ACODE = {"호텔신라":"008770","두산퓨얼셀":"336260","한화솔루션":"009830",
@@ -15,16 +15,20 @@
 "동서":"026960","kb금융":"105560","한국금융지주":"071050","신한지주":"055550",
 "우리금융지주":"316140","하나금융지주":"086790"}
 
+
 HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
-
-client = MongoClient('localhost', 27017)
-MongoDB = client['DC']
-collection = MongoDB['DC_title_crawl']
+# RESULT = []
+# client = MongoClient('localhost', 27017)
+# MongoDB = client['DC']
+# collection = MongoDB['DC_title_crawl']
 
-#stop date
-DATE_STOP = '2021-09-20'
+# stop date
+# DATE_STOP = '2021-09-20'
+DATE_STOP = '2021-08-19'
 
-def title_crawl(ls, ticker):
+def title_crawl(ls, ticker, result):
+    global DATE_STOP
+
     for i in range(0,len(ls)-1):
         response = requests.get('https://gall.dcinside.com' + str(ls[i]),headers = HEADERS)
         soup = BeautifulSoup(response.content,'html.parser')
@@ -35,40 +39,41 @@ def title_crawl(ls, ticker):
         if j.find('td',{'class':'gall_writer ub-writer'}).text=='운영자':
             pass
         else:
-            #date
+            # date
             date_dict = j.find('td',{'class':'gall_date'}).attrs
 
             if date_dict['title'][:10] <= DATE_STOP:
                 return -1
             else:
                 # Date.append(date_dict['title'])
-                #title
+                # title
                 title = j.find('a').text
-                #recommendation count
+                # recommendation count
                 recommend_tag = j.find('td', class_='gall_recommend')
                 recommend = recommend_tag.text
                 # Rec.append(recommend)
 
-                #view count
+                # view count
                 views_tag = j.find('td', class_='gall_count')
                 views = views_tag.text
                 # View.append(views)
 
                 put_data = {
                     'code': ACODE[ticker],
-                    'title' : title,
                     'date' : date_dict['title'],
+                    'title' : title,
                     'view' : views,
                     'recommend' : recommend
                 }
 
+                result.append(put_data)
                 # code that inserts into MongoDB
-                MongoDB.DC_title_crawl.insert_one(put_data)
+                # MongoDB.DC_title_crawl.insert_one(put_data)
 
 
 # change the search keyword per ticker
-def DC(ticker, page):
-
+def DC_crawling(ticker, page):
+    RESULT = []
     url = f"https://gall.dcinside.com/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}"
 
     a = f'/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}'
@@ -83,7 +88,7 @@ def DC(ticker, page):
     url_list.append(url_url)
 
     # first page
-    title_crawl(url_list, ticker)
+    title_crawl(url_list, ticker, RESULT)
 
     k = 0
     while k < page :
@@ -102,8 +107,10 @@
         else:
             k += 1
 
+    return RESULT
+
 
 if __name__ == "__main__":
     for t in ACODE.keys():
-        DC(t, 40)
+        DC_crawling(t, 100)  # 100 pages
 