Commit

create
QiuBingCheng committed Jan 9, 2022
1 parent c301714 commit e04a28d
Showing 44 changed files with 128,543 additions and 0 deletions.
133 changes: 133 additions & 0 deletions .gitignore
@@ -0,0 +1,133 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# reference
Reference/
data/
24 changes: 24 additions & 0 deletions Crawler/README.md
@@ -0,0 +1,24 @@
# Crawler
This is a repository for crawlers.

## news_crawler
Enter specific keywords to crawl Google News for matching articles.

### Getting started

```python
from news_crawler import Crawler
c = Crawler()
c.crawl_news("華碩", pages=2, timeout=3, time_sleep=5)
c.get_data()
```

![](<https://github.com/vbjc5275/Crawler/blob/master/image/news-crawler.jpg>)


## foodpanda_crawler

![](<https://github.com/vbjc5275/Crawler/blob/master/image/vendor_info.png>)
![](<https://github.com/vbjc5275/Crawler/blob/master/image/vendor_menu.png>)
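
Crawls restaurant info (name, link, photo, rating) and menus from foodpanda Taiwan and saves them to an Excel file.

### Getting started

A minimal usage sketch based on the helper functions defined in `foodpanda_crawler.py`. Note that the script currently runs a full crawl at module level, so the snippet below assumes you copy the functions into your own module or guard the script's top-level code before importing.

```python
from foodpanda_crawler import get_all_city_link, get_restaurant_info, get_restaurant_menu

city_links = get_all_city_link()                  # homepage link for each city
restaurants = get_restaurant_info(city_links[0])  # DataFrame of vendors for one city
menu = get_restaurant_menu(restaurants.link[0])   # list of dishes for the first vendor
```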

## options crawler
150 changes: 150 additions & 0 deletions Crawler/foodpanda_crawler.py
@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 2 19:44:21 2020
@author: Jerry
"""


# =============================================================================
# import
# =============================================================================
import requests
from bs4 import BeautifulSoup, element
import pandas as pd
from re import search
import time
# =============================================================================
# find the homepage of each city
# =============================================================================
HOME_URL = "https://www.foodpanda.com.tw"

def get_all_city_link():
    """Return the links to every city's homepage."""
    response = requests.get(HOME_URL)
    soup = BeautifulSoup(response.text, "html.parser")
    all_a = soup.find_all("a", class_="city-tile")
    all_link = [HOME_URL + a.get("href") for a in all_a]

    return all_link

# =============================================================================
# foodpanda-find all vendor
# =============================================================================
# Take Taipei as an example
def get_restaurant_info(url):
    """Return a DataFrame with the name, link, photo, rating, review count and tag of each vendor."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    all_li = soup.find("ul", class_="vendor-list").children

    all_restaurant = []
    for v in all_li:
        if isinstance(v, element.Tag):
            all_restaurant.append(v)

    restaurants_info = []
    for restaurant in all_restaurant:
        v = {}
        # restaurant name
        v["name"] = restaurant.find("span", class_="name fn").text
        # restaurant link
        v["link"] = HOME_URL + restaurant.find("a").get("href")
        # restaurant photo
        pic_url = restaurant.find("div").get("data-src")
        v["pic_url"] = pic_url[:pic_url.find("?")]

        # restaurant rating; fall back to "NA"/0 when a vendor has no reviews
        try:
            v["rating"] = restaurant.find("span", class_="rating").find("strong").text
            v["count"] = restaurant.find("span", class_="count").text.strip()
        except AttributeError:
            v["rating"] = "NA"
            v["count"] = 0

        try:
            v["tag"] = restaurant.find("span", class_="multi-tag").text
        except AttributeError:
            v["tag"] = "NA"

        restaurants_info.append(v)
    return pd.DataFrame(restaurants_info)


cities_link = get_all_city_link()
restaurants_info = get_restaurant_info(cities_link[0])
restaurants_info["id"] = list(range(1, len(restaurants_info) + 1))
# =============================================================================
# foodpanda - find menu
# =============================================================================
def get_restaurant_menu(url):
    """Return the menu of the restaurant as a list of dicts."""
    menu = []
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    dishes_section = soup.find_all("div", class_="dish-category-section")
    for section in dishes_section:
        # menu category
        category = section.find("h2", class_="dish-category-title").text
        dishes = section.find("ul", class_="dish-list").find_all("li")
        for dish in dishes:
            m = {}
            m["category"] = category
            # dish name
            m["name"] = dish.find("h3").find("span").text
            # dish description (not every dish has one)
            try:
                m["description"] = dish.find("p").text.strip()
            except AttributeError:
                m["description"] = "NA"
            # dish price
            m["price"] = dish.find("span", class_="price p-price").text.strip()
            menu.append(m)
    return menu

res_url = restaurants_info.link[0]
menu = get_restaurant_menu(res_url)

# get the menu of every restaurant
sleep = 0.5
menu_list = []
for id_, link in zip(restaurants_info["id"].values,
                     restaurants_info["link"].values):
    print(link)
    menu = get_restaurant_menu(link)
    # tag each dish with its restaurant ID
    for i in range(len(menu)):
        menu[i]["id"] = id_
    menu_list.extend(menu)
    time.sleep(sleep)

restaurants_menu = pd.DataFrame(menu_list)

# =============================================================================
# clean data
# =============================================================================
def extract_number(str_num):
    """Extract a numeric price such as '123.00' from a price string."""
    pattern = r"[\d,]+\.[\d]{2}"
    return search(pattern, str_num).group(0)

restaurants_menu.price = restaurants_menu.price.apply(extract_number)
restaurants_menu.price = restaurants_menu.price.apply(lambda price: price.replace(",", ""))

# remove unnecessary categories
unwanted_category = ["注意事項", "營養標示"]
unwanted_index = []
for category in unwanted_category:
    index = restaurants_menu[restaurants_menu.category.str.contains(category)].index
    if len(index) > 0:
        unwanted_index.extend(index)
restaurants_menu.drop(unwanted_index, axis=0, inplace=True)

# use bracket assignment: `count` is a DataFrame method, so attribute assignment would not update the column
restaurants_info["count"] = restaurants_info["count"].astype(int)
restaurants_menu["price"] = restaurants_menu["price"].astype(float)
# =============================================================================
# save data
# =============================================================================
path = "food-panda.xlsx"
with pd.ExcelWriter(path) as writer:
restaurants_info.to_excel(writer,sheet_name="店家評分",index=False)
restaurants_menu.to_excel(writer,sheet_name="店家菜單",index=False)

23 changes: 23 additions & 0 deletions Crawler/get proxy.py
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 22:50:43 2020
@author: Jerry
"""
from bs4 import BeautifulSoup
import requests
def get_soup(url):
    """Fetch the page through the configured proxies and return a parsed BeautifulSoup object."""
    proxies = {
        "http": "40.121.198.48:80",
        "https": "69.195.157.162:8100"
    }
    # headers = {'user-agent': UA.random}
    response = requests.get(url, proxies=proxies)
    soup = BeautifulSoup(response.text, features="html.parser")
    response.close()
    return soup

url = "https://medium.com/qiubingcheng/%E5%A6%82%E4%BD%95%E5%AE%89%E8%A3%9Danaconda-%E4%B8%A6%E4%B8%94%E5%8C%AF%E5%85%A5conda%E8%99%9B%E6%93%AC%E7%92%B0%E5%A2%83-ba2e140706a3"
soup = get_soup(url)

requests.get(url)
Binary file added Crawler/image/news-crawler.jpg
Binary file added Crawler/image/vendor_info.png
Binary file added Crawler/image/vendor_menu.png
