From ef92ca801e5b9d08910349127b3d96057784ba50 Mon Sep 17 00:00:00 2001
From: jayleenym
Date: Tue, 23 Nov 2021 10:03:01 +0900
Subject: [PATCH] Crawling with Kafka producer #4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Kafka/DC_test_1122.ipynb |  48 ++++++++
 Kafka/dc_crawling.py     | 116 ++++++++++++++++++
 .../dc_crawling.py"      |  41 ++++---
 3 files changed, 188 insertions(+), 17 deletions(-)
 create mode 100644 Kafka/DC_test_1122.ipynb
 create mode 100644 Kafka/dc_crawling.py

diff --git a/Kafka/DC_test_1122.ipynb b/Kafka/DC_test_1122.ipynb
new file mode 100644
index 0000000..afd1787
--- /dev/null
+++ b/Kafka/DC_test_1122.ipynb
@@ -0,0 +1,48 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "import random\n",
+    "import datetime\n",
+    "# assumes dc_crawling.py is in the same folder\n",
+    "from dc_crawling import *\n",
+    "from kafka import KafkaProducer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bootstrap_servers = ['localhost:9092','localhost:9093'] # Kafka broker addresses\n",
+    "topicName = 'DC'\n",
+    "producer = KafkaProducer(bootstrap_servers = bootstrap_servers)\n",
+    "\n",
+    "for t in ACODE.keys():\n",
+    "    dcs = DC_crawling(t, 100)\n",
+    "    for i in dcs:\n",
+    "        print(i)\n",
+    "        producer.send(topicName, str(i).encode())\n",
+    "\n",
+    "    tim = datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
+    "    producer.send(topicName, tim.encode())\n",
+    "\n",
+    "    # time.sleep(1)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Kafka/dc_crawling.py b/Kafka/dc_crawling.py
new file mode 100644
index 0000000..a7c5a62
--- /dev/null
+++ b/Kafka/dc_crawling.py
@@ -0,0 +1,116 @@
+import requests
+from urllib import request
+from bs4 import BeautifulSoup
+import time
+import pandas as pd
+# from pymongo import MongoClient
+
+
+ACODE = {"호텔신라":"008770","두산퓨얼셀":"336260","한화솔루션":"009830",
+         "에코프로":"086520","신성이엔지":"011930","유니슨":"018000",
+         "카카오":"035720","네이버":"035420","엔씨소프트":"036570",
+         "sds":"018260","삼성전자":"005930","SK하이닉스":"000660",
+         "skc":"011790","한솔케미칼":"014680","DB하이텍":"000990",
+         "롯데쇼핑":"023530","신세계":"004170","삼성물산":"028260",
+         "동서":"026960","kb금융":"105560","한국금융지주":"071050","신한지주":"055550",
+         "우리금융지주":"316140","하나금융지주":"086790"}
+
+
+HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
+# RESULT = []
+# client = MongoClient('localhost', 27017)
+# MongoDB = client['DC']
+# collection = MongoDB['DC_title_crawl']
+
+# stop date
+# DATE_STOP = '2021-09-20'
+DATE_STOP = '2021-08-19'
+
+def title_crawl(ls, ticker, result):
+    global DATE_STOP
+
+    for i in range(0,len(ls)-1):
+        response = requests.get('https://gall.dcinside.com' + str(ls[i]),headers = HEADERS)
+        soup = BeautifulSoup(response.content,'html.parser')
+        items = soup.find_all("table",{'class':'gall_list'})
+        contents = soup.find('tbody').find_all('tr')
+
+        for j in contents:
+            if j.find('td',{'class':'gall_writer ub-writer'}).text=='운영자':
+                pass
+            else:
+                # date
+                date_dict = j.find('td',{'class':'gall_date'}).attrs
+
+                if date_dict['title'][:10] <= DATE_STOP:
+                    return -1
+                else:
+                    # Date.append(date_dict['title'])
+                    # title
+                    title = j.find('a').text
+                    # recommend count
+                    recommend_tag = j.find('td', class_='gall_recommend')
+                    recommend = recommend_tag.text
+                    # Rec.append(recommend)
+
+                    # view count
+                    views_tag = j.find('td', class_='gall_count')
+                    views = views_tag.text
+                    # View.append(views)
+
+                    put_data = {
+                        'code': ACODE[ticker],
+                        'date' : date_dict['title'],
+                        'title' : title,
+                        'view' : views,
+                        'recommend' : recommend
+                    }
+
+                    result.append(put_data)
+                    # code to insert into MongoDB
+                    # MongoDB.DC_title_crawl.insert_one(put_data)
+
+
+# change the search keyword
+def DC_crawling(ticker, page):
+    RESULT = []
+    url = f"https://gall.dcinside.com/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}"
+
+    a = f'/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}'
+    response = requests.get(url,headers=HEADERS)
+    soup = BeautifulSoup(response.text,'html.parser')
+    items = soup.find('div',{'class':"bottom_paging_box"})
+
+    # build the list of paging URLs
+    url_list=[a]
+    for i in items.find_all('a'):
+        url_url = i['href']
+        url_list.append(url_url)
+
+    # first page
+    title_crawl(url_list, ticker, RESULT)
+
+    k = 0
+    while k < page :
+        response = requests.get('https://gall.dcinside.com' + str(url_list[-1]),headers=HEADERS)
+        soup = BeautifulSoup(response.content,'html.parser')
+        items = soup.find('div',{'class':"bottom_paging_box"})
+        url_list=[str(url_list[-1])]
+
+        for i in items.find_all('a'):
+            url_url = i['href']
+            url_list.append(url_url)
+        url_list.pop(1)
+
+        if title_crawl(url_list, ticker, RESULT) == -1:
+            break
+        else:
+            k += 1
+
+    return RESULT
+
+
+
+if __name__ == "__main__":
+    for t in ACODE.keys():
+        DC_crawling(t, 100) # 100 pages
diff --git "a/crawling/dc\352\260\244\353\237\254\353\246\254/dc_crawling.py" "b/crawling/dc\352\260\244\353\237\254\353\246\254/dc_crawling.py"
index 41e3e03..a7c5a62 100644
--- "a/crawling/dc\352\260\244\353\237\254\353\246\254/dc_crawling.py"
+++ "b/crawling/dc\352\260\244\353\237\254\353\246\254/dc_crawling.py"
@@ -3,7 +3,7 @@
 from bs4 import BeautifulSoup
 import time
 import pandas as pd
-from pymongo import MongoClient
+# from pymongo import MongoClient
 
 
 ACODE = {"호텔신라":"008770","두산퓨얼셀":"336260","한화솔루션":"009830",
@@ -15,16 +15,20 @@
          "동서":"026960","kb금융":"105560","한국금융지주":"071050","신한지주":"055550",
          "우리금융지주":"316140","하나금융지주":"086790"}
 
+
 HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
+# RESULT = []
+# client = MongoClient('localhost', 27017)
+# MongoDB = client['DC']
+# collection = MongoDB['DC_title_crawl']
 
-client = MongoClient('localhost', 27017)
-MongoDB = client['DC']
-collection = MongoDB['DC_title_crawl']
+# stop date
+# DATE_STOP = '2021-09-20'
+DATE_STOP = '2021-08-19'
 
-#stop date
-DATE_STOP = '2021-09-20'
+def title_crawl(ls, ticker, result):
+    global DATE_STOP
 
-def title_crawl(ls, ticker):
     for i in range(0,len(ls)-1):
         response = requests.get('https://gall.dcinside.com' + str(ls[i]),headers = HEADERS)
         soup = BeautifulSoup(response.content,'html.parser')
@@ -35,40 +39,41 @@ def title_crawl(ls, ticker):
             if j.find('td',{'class':'gall_writer ub-writer'}).text=='운영자':
                 pass
             else:
-                #date
+                # date
                 date_dict = j.find('td',{'class':'gall_date'}).attrs
 
                 if date_dict['title'][:10] <= DATE_STOP:
                     return -1
                 else:
                     # Date.append(date_dict['title'])
-                    #title
+                    # title
                     title = j.find('a').text
-                    #recommend count
+                    # recommend count
                     recommend_tag = j.find('td', class_='gall_recommend')
                     recommend = recommend_tag.text
                     # Rec.append(recommend)
 
-                    #view count
+                    # view count
                     views_tag = j.find('td', class_='gall_count')
                     views = views_tag.text
                     # View.append(views)
 
                     put_data = {
                         'code': ACODE[ticker],
-                        'title' : title,
                         'date' : date_dict['title'],
+                        'title' : title,
                         'view' : views,
                         'recommend' : recommend
                     }
 
+                    result.append(put_data)
                     # code to insert into MongoDB
-                    MongoDB.DC_title_crawl.insert_one(put_data)
+                    # MongoDB.DC_title_crawl.insert_one(put_data)
 
 
 # change the search keyword
-def DC(ticker, page):
-
+def DC_crawling(ticker, page):
+    RESULT = []
     url = f"https://gall.dcinside.com/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}"
 
     a = f'/board/lists?id=neostock&s_type=search_subject_memo&s_keyword={ticker}'
     response = requests.get(url,headers=HEADERS)
     soup = BeautifulSoup(response.text,'html.parser')
     items = soup.find('div',{'class':"bottom_paging_box"})
@@ -83,7 +88,7 @@ def DC(ticker, page):
         url_list.append(url_url)
 
     # first page
-    title_crawl(url_list, ticker)
+    title_crawl(url_list, ticker, RESULT)
 
     k = 0
     while k < page :
@@ -102,8 +107,10 @@
         else:
             k += 1
 
+    return RESULT
+
 
 if __name__ == "__main__":
     for t in ACODE.keys():
-        DC(t, 40)
+        DC_crawling(t, 100) # 100 pages
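
Note (not part of the patch): a minimal consumer-side sketch for checking the
records that DC_test_1122.ipynb produces. It assumes the same brokers
(localhost:9092, localhost:9093) and topic name 'DC' used above; the
kafka-python options auto_offset_reset and consumer_timeout_ms are
illustrative choices, not taken from this repo.

    from kafka import KafkaConsumer

    # Subscribe to the 'DC' topic on the same brokers the producer uses.
    consumer = KafkaConsumer(
        'DC',
        bootstrap_servers=['localhost:9092', 'localhost:9093'],
        auto_offset_reset='earliest',  # start from the oldest retained record
        consumer_timeout_ms=10000,     # stop iterating after 10s with no messages
    )

    # Each record was sent as str(dict).encode() or an encoded timestamp
    # string, so decoding the raw bytes back to text is enough to inspect it.
    for msg in consumer:
        print(msg.value.decode('utf-8'))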