Skip to content

Commit

Permalink
chore: 更新文章爬虫
Browse files Browse the repository at this point in the history
  • Loading branch information
WAMaker committed Sep 13, 2019
1 parent 8cfd71e commit 38a66bb
Show file tree
Hide file tree
Showing 9 changed files with 148 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ django = "*"
djangorestframework = "*"
markdown = "*"
django-filter = "*"
"beautifulsoup4" = "*"
beautifulsoup4 = "*"
lxml = "*"
requests = "*"
gunicorn = "*"
Expand Down
18 changes: 9 additions & 9 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

112 changes: 74 additions & 38 deletions glaw/app/core/post_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
import re
import json
import requests
from bs4 import BeautifulSoup
from markdown import markdown

POST_LIST_API_URL = "https://api.github.com/repos/SwiftGGTeam/source/contents/_posts?ref=master"

TEST_TOKEN = "adc33ddb0505ece9a9fdb3b58327871fa49c6a15"
TEST_TOKEN = ""


def excutor_post() -> list:
def executor_post() -> list:
# TEST_TOKEN = input("private token: ")
# 请求列表
headers = {
Expand Down Expand Up @@ -39,7 +41,8 @@ def excutor_post() -> list:

def resolve_header(raw: str) -> dict:
# 模拟 raw 文件
# r = requests.get("https://raw.githubusercontent.com/SwiftGGTeam/source/master/_posts/20151029_list-comprehensions-and-performance-with-swift.md")
# r = requests.get("https://raw.githubusercontent.com/SwiftGGTeam/source/master/_posts/20190709_sets-in-swift.md")
# r = requests.get("https://raw.githubusercontent.com/SwiftGGTeam/source/master/_posts/20190415_core-bluetooth.md")
# raw = str(r.content, encoding="utf-8")
info = {}
preface = ''
Expand All @@ -50,22 +53,25 @@ def resolve_header(raw: str) -> dict:
continue
# print(line)
# 匹配 title
title_re_test = re.match(r'^title:.*?"(.+?)".*?$', line)
title_re_test = re.match(r'^.*?title:.*?"(.+?)".*?$', line)
title_re_test2 = re.match(r'^.*?title:(.+?)$', line)
if title_re_test:
info['title'] = title_re_test.group(1)
elif title_re_test2:
info['title'] = title_re_test2.group(1).strip()

# 匹配时间
date_re_test = re.match(r'^date:.*?(\d{4}-\d{1,2}-\d{1,2}).*?$', line)
date_re_test = re.match(r'^.*?date:.*?(\d{4}-\d{1,2}-\d{1,2}).*?$', line)
if date_re_test:
info['date'] = date_re_test.group(1)

# 匹配 permalink
category_re_test = re.match(r'^permalink:(.+)$', line)
category_re_test = re.match(r'^.*?permalink:(.+)$', line)
if category_re_test:
info['permalink'] = category_re_test.group(1).strip()

# 匹配 categories
categories_re_test = re.match(r'^categories:.*?\[(.*?)\].*?$', line)
categories_re_test = re.match(r'^.*?categories:.*?\[(.*?)\].*?$', line)
if categories_re_test:
categories = categories_re_test.group(1).split(',')
if len(categories) > 0 and len(categories[0]) > 0:
Expand All @@ -74,7 +80,13 @@ def resolve_header(raw: str) -> dict:
# 匹配 more
if re.match(r'^\s*<\s*!-+\s*more\s*-+\s*>\s*$', line):
# 移除最后一个 \n
info['preface'] = preface[:-1]
text, url = get_preface_and_image_url(preface[:-1])
print(text)
info['preface'] = text
if url.startswith('/'):
url = 'https://swift.gg' + url
print("updated final url: " + url)
info['thumbnail'] = url
break

if preface_start:
Expand All @@ -97,6 +109,30 @@ def resolve_header(raw: str) -> dict:
return info


def get_image_urls(text):
    """Return the URLs of every Markdown image tag (``![alt](url)``) in *text*.

    :param text: Markdown source to scan.
    :return: list of the strings found between the parentheses of each image
        tag, in order of appearance; an empty list when there is no image.
    """
    # The alt text is matched non-greedily: a greedy `.*` would run up to the
    # last `](` on the line, so with several images on one line only the final
    # URL would be reported and the earlier ones silently lost.
    return re.findall(r'!\[.*?\]\((.*?)\)', text)


def get_preface_and_image_url(preface: str):
    """Return the preface as plain text together with its first image URL.

    :param preface: raw Markdown text of the preface.
    :return: tuple ``(text, image_url)`` where ``text`` is the preface with
        Markdown markup and newlines removed, and ``image_url`` is the first
        URL reported by ``get_image_urls`` (``''`` when there is none).
    """
    # Collect the image URLs from the raw Markdown; only the first one is used.
    urls = get_image_urls(preface)
    for url in urls:
        print('[url]:', url)
    image_url = urls[0] if urls else ''

    # Strip the Markdown markup: render to HTML, then keep only the text nodes.
    # `find_all(string=True)` is the current spelling of the legacy
    # `findAll(text=True)` call and selects the same nodes.
    html = markdown(preface)
    text_nodes = BeautifulSoup(html, 'html.parser').find_all(string=True)
    plain_text = ''.join(text_nodes).replace('\n', '')
    return plain_text, image_url


def resolve_body(raw) -> dict:
# 模拟 raw 文件
# r = requests.get("https://raw.githubusercontent.com/SwiftGGTeam/source/master/_posts/20151029_list-comprehensions-and-performance-with-swift.md")
Expand Down Expand Up @@ -127,44 +163,42 @@ def ptr_post_dict(post: dict):
print('category: %s' % post['category'])


def bulk_insert(res):
    """Create posts in one batch, keeping only the first entry per title.

    :param res: iterable of post dicts; entries without a 'title' key are
        skipped, as are entries whose title was already seen in this call.
    """
    seen_titles = set()
    pending = []
    for entry in res:
        if 'title' not in entry:
            continue
        title = entry['title']
        if title in seen_titles:
            continue
        seen_titles.add(title)
        pending.append(make_post(entry))
    # Single batched write of everything collected above.
    Post.objects.bulk_create(pending)


def bulk_update(res):
def bulk_update(posts: list):
"""
爬虫更新策略
:param res: 传入 excutor_post 爬虫结果
:param posts: 传入 executor_post 爬虫结果
:return:
"""
flags = {}
list_to_insert = list()
list_to_update = list()
for post_dic in res:
if not 'title' in post_dic.keys():
continue
if post_dic['title'] in flags.keys():
continue
flags[post_dic['title']] = True
list_to_update.append(make_post(post_dic))
Post.objects.bulk_update(list_to_update, ['body', 'category', 'preface'])

inserted_titles = set()

for post_dict in posts:
if 'title' in post_dict.keys():
title = post_dict['title']
try:
post = Post.objects.get(title=title)
update(post, post_dict)
list_to_update.append(post)
except Post.DoesNotExist:
if title in inserted_titles:
print('existed title: %s' % title)
continue

inserted_titles.add(title)
post = Post(title=title)
update(post, post_dict)
list_to_insert.append(post)
else:
print('match failed:')
print(post_dict)

Post.objects.bulk_create(list_to_insert)
Post.objects.bulk_update(list_to_update, ['preface', 'thumbnail'])

def make_post(post_dict: dict):
try:
post = Post.objects.get(title=post_dict['title'])
except Post.DoesNotExist:
post = Post(title=post_dict['title'])

def update(post: Post, post_dict: dict):
if 'date' in post_dict.keys():
post.published_at = datetime.strptime(post_dict['date'], "%Y-%m-%d")
if 'html_url' in post_dict.keys():
Expand All @@ -178,6 +212,8 @@ def make_post(post_dict: dict):
post.category = category
if 'preface' in post_dict.keys():
post.preface = post_dict['preface']
if 'thumbnail' in post_dict.keys():
post.thumbnail = post_dict['thumbnail']

post.body = post_dict['body']
return post
Expand Down
43 changes: 43 additions & 0 deletions glaw/app/migrations/0005_auto_20190913_0843.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Generated by Django 2.2.5 on 2019-09-13 08:43

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the `thumbnail` column to `post`, adds four bookkeeping columns to
    # `product`, and redefines `product.preface` as an optional field.

    # Must be applied after migration 0004_product of the `app` application.
    dependencies = [
        ('app', '0004_product'),
    ]

    operations = [
        # Optional thumbnail URL for a post.
        # NOTE(review): max_length=128 may be short for crawled image URLs — confirm.
        migrations.AddField(
            model_name='post',
            name='thumbnail',
            field=models.URLField(blank=True, max_length=128, null=True, verbose_name='缩略图'),
        ),
        # Availability flag for a product; defaults to True.
        migrations.AddField(
            model_name='product',
            name='is_available',
            field=models.BooleanField(default=True, verbose_name='商品是否有效'),
        ),
        # Publication flag for a product; defaults to False.
        migrations.AddField(
            model_name='product',
            name='is_published',
            field=models.BooleanField(default=False, verbose_name='商品是否已发布'),
        ),
        # Optional product source, up to 56 characters.
        migrations.AddField(
            model_name='product',
            name='source',
            field=models.CharField(blank=True, max_length=56, null=True, verbose_name='商品来源'),
        ),
        # Optional reason text for an unavailable product, up to 128 characters.
        migrations.AddField(
            model_name='product',
            name='unavailable_reason',
            field=models.CharField(blank=True, max_length=128, null=True, verbose_name='商品无效原因'),
        ),
        # Redefine product.preface as an optional (blank/null) 256-character field.
        migrations.AlterField(
            model_name='product',
            name='preface',
            field=models.CharField(blank=True, max_length=256, null=True, verbose_name='简介'),
        ),
    ]
1 change: 1 addition & 0 deletions glaw/app/models/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Post(models.Model):
origin_title = models.CharField("原文标题", max_length=128, null=True, blank=True, unique=True)
body = models.TextField("内容")
preface = models.TextField("前言", null=True, blank=True)
thumbnail = models.URLField(verbose_name="缩略图", max_length=128, blank=True, null=True)

# 原文作者
author = models.CharField(verbose_name="作者", max_length=128)
Expand Down
12 changes: 6 additions & 6 deletions glaw/app/models/product.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ class Product(models.Model):
origin_price = models.IntegerField(verbose_name="原价", blank=False, null=False)
banner_display = models.BooleanField(verbose_name="是否展示在 banner 位")

is_available = models.BooleanField(verbose_name="商品是否有效")
is_published = models.BooleanField(verbose_name="商品是否已发布")
unavailable_reason = models.CharField("商品无效原因", max_length=128)
is_available = models.BooleanField(verbose_name="商品是否有效", default=True)
is_published = models.BooleanField(verbose_name="商品是否已发布", default=False)
unavailable_reason = models.CharField("商品无效原因", max_length=128, blank=True, null=True)

source = models.CharField("商品来源", max_length=56)
preface = models.CharField("简介", max_length=256)
body = models.TextField("内容", null=False)
source = models.CharField("商品来源", max_length=56, blank=True, null=True)
preface = models.CharField("简介", max_length=256, blank=True, null=True)
body = models.TextField("内容")

thumbnail_url = models.URLField(verbose_name="缩略图链接", max_length=128, blank=False, null=False)
purchase_url = models.URLField(verbose_name="购买链接", max_length=128, blank=False, null=False)
Expand Down
2 changes: 2 additions & 0 deletions glaw/app/serializers/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
class PostListSerializer(serializers.ModelSerializer):
category = serializers.StringRelatedField(many=False)
publishDate = serializers.DateTimeField(source='published_at', format="%Y-%m-%d")
imageURL = serializers.URLField(source='thumbnail')

class Meta:
model = Post
fields = (
'id',
'title',
'preface',
'imageURL',
'category',
'publishDate'
)
Expand Down
1 change: 1 addition & 0 deletions glaw/app/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
# URL routes of the app: `app/posts` is handled by views.query_posts and
# `app/post/<digits>` by views.query_post, which receives the digits as the
# `post_id` keyword argument. Both patterns are anchored at start and end.
urlpatterns = [
    re_path(r'^app/posts$', views.query_posts),
    re_path(r'^app/post/(?P<post_id>\d+)$', views.query_post),
    # The crawl route below is commented out, so views.crawl_post is not
    # reachable through these patterns.
    # re_path(r'^crawl/posts$', views.crawl_post)
]
11 changes: 11 additions & 0 deletions glaw/app/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from rest_framework.response import Response
from rest_framework.decorators import action, api_view, permission_classes, parser_classes

from app.core import post_crawl

# Create your views here.


Expand Down Expand Up @@ -43,6 +45,15 @@ def query_post(request, post_id):
return Response(render_failure('404 Not Found'), status=status.HTTP_404_NOT_FOUND)


@api_view(['GET'])
@permission_classes((permissions.AllowAny, ))
@parser_classes((JSONParser,))
def crawl_post(request):
    """Run the post crawler, store its results and return them.

    Calls ``post_crawl.executor_post()``, passes the result to
    ``post_crawl.bulk_update()`` and responds with HTTP 200 carrying
    ``render_success`` of the crawl result.

    NOTE(review): this view accepts GET with AllowAny while triggering a crawl
    and database writes — confirm the intended access policy before routing it.
    """
    crawled_posts = post_crawl.executor_post()
    post_crawl.bulk_update(crawled_posts)
    payload = render_success(crawled_posts)
    return Response(payload, status=status.HTTP_200_OK)


def render_page_resp(page, limit, total, items):
return {
'pageBean': {
Expand Down

0 comments on commit 38a66bb

Please sign in to comment.