-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathsync_trending.py
167 lines (139 loc) · 5.14 KB
/
sync_trending.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
同步github trending到notion
"""
import logging
import time
import requests
from pyquery import PyQuery as pq
from github import Github
from github import Auth
from notion_client import Client
from config import CONFIG
class TrendItem:
    """A repository entry taken from the GitHub trending page.

    ``title``, ``url`` and ``desc`` are supplied by the caller. The three
    ``*_count`` attributes start at 0 and are only filled in by
    :meth:`fullfill_repo_info`.
    """

    def __init__(self, title: str, url: str, desc: str) -> None:
        self.title = title
        self.url = url
        self.desc = desc
        self.watchers_count = 0
        self.forks_count = 0
        self.stargazers_count = 0

    def __repr__(self) -> str:
        # Gives log lines such as ``logging.info(trend)`` a readable value
        # instead of the default ``<TrendItem object at 0x...>``.
        return (
            f"{type(self).__name__}(title={self.title!r}, url={self.url!r}, "
            f"stargazers_count={self.stargazers_count}, "
            f"forks_count={self.forks_count}, "
            f"watchers_count={self.watchers_count})"
        )

    def _repo_path(self) -> str:
        """Return the last two '/'-separated segments of ``url`` joined by '/'."""
        # Drop trailing slashes first so ".../owner/repo/" still yields "owner/repo".
        return '/'.join(self.url.rstrip('/').split('/')[-2:])

    def fullfill_repo_info(self, git_token):
        """Fill the watcher, fork and stargazer counts from the GitHub API.

        Does nothing when ``url`` is empty. When ``git_token`` is falsy the
        GitHub client is created without authentication. Any error raised
        while fetching the repository is logged and the counts are left
        unchanged.
        """
        if not self.url:
            return
        auth = Auth.Token(git_token) if git_token else None
        git = Github(auth=auth)
        repo_path = self._repo_path()
        try:
            repo = git.get_repo(repo_path)
        # pylint: disable-next=broad-except
        except Exception as _e:
            logging.error("get repo %s error: %s", repo_path, _e)
            return
        self.watchers_count = repo.watchers_count
        self.forks_count = repo.forks_count
        self.stargazers_count = repo.stargazers_count
def query_page(client: Client, database_id: str, title: str) -> bool:
    """Return True when the database already holds a page whose "Title" equals ``title``.

    Callers use this to skip entries that were inserted earlier.
    """
    # Short pause before each query (presumably to stay under the Notion
    # API rate limit - confirm).
    time.sleep(0.3)
    title_filter = {
        "property": "Title",
        "rich_text": {"equals": title},
    }
    matches = client.databases.query(
        database_id=database_id,
        filter=title_filter,
    )["results"]
    return len(matches) > 0
def insert_page(client: Client, database_id: str, language: str, trend: TrendItem) -> None | str:
    """Create a page for ``trend`` in the given database and return the new page's id."""

    def _text(content):
        # One-element text array wrapping ``content``.
        return [{"type": "text", "text": {"content": content}}]

    created = client.pages.create(
        parent={
            "database_id": database_id,
            "type": "database_id",
        },
        properties={
            "Title": {"title": _text(trend.title)},
            "Language": {"select": {"name": language}},
            "URL": {"url": trend.url},
            "Desc": {"rich_text": _text(trend.desc)},
            "WatchersCount": {"number": trend.watchers_count},
            "ForksCount": {"number": trend.forks_count},
            "StargazersCount": {"number": trend.stargazers_count},
        },
    )
    return created["id"]
def _scrape(language: str) -> list[TrendItem]:
    """Fetch ``https://github.com/trending/<language>`` and return its repositories.

    Each ``article.Box-row`` element on the page becomes one ``TrendItem``
    built from its link text, its description paragraph and its link target.

    Returns an empty list when the request fails or the response status is
    not 200; the failure is logged.
    """
    headers = {
        # pylint: disable=line-too-long
        'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
        'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding' : 'gzip,deflate,sdch',
        'Accept-Language' : 'zh-CN,zh;q=0.8'
    }
    timeout_seconds = 30
    page_url = f'https://github.com/trending/{language}'
    try:
        # Without a timeout a stalled connection would block the whole sync.
        req = requests.get(page_url, headers=headers, timeout=timeout_seconds)
    except requests.RequestException as err:
        logging.error("git trending error. %s", err)
        return []
    if req.status_code != 200:
        logging.error("git trending error. %d", req.status_code)
        return []

    result = []
    for item in pq(req.content)('div.Box article.Box-row'):
        row = pq(item)
        link = row(".lh-condensed a")
        href = link.attr("href")
        if not href:
            # No link target means no repository URL can be built; skip the row.
            continue
        result.append(
            TrendItem(link.text(), "https://github.com" + href, row("p.col-9").text())
        )
    return result
def _filter_repo(trend: TrendItem) -> bool:
    """Return True when ``trend`` falls below a configured minimum.

    Each ``Min*`` option is read from the "trending.language" config section
    and compared with the matching counter on ``trend``. Options whose value
    is 0 or negative are not enforced.
    """
    option_to_attr = (
        ('MinStargazers', 'stargazers_count'),
        ('MinForks', 'forks_count'),
        ('MinWatchers', 'watchers_count'),
    )
    for option, attr in option_to_attr:
        minimum = CONFIG.getint("trending.language", option)
        if minimum <= 0:
            continue
        if getattr(trend, attr, 0) < minimum:
            return True
    return False
# pylint: disable=line-too-long
def _sync(client: Client, database_id: str, language: str, trends: list[TrendItem], git_token: str) -> None:
    """Insert every trend that is not yet in the database.

    For each item: skip it when a page with the same title already exists,
    fetch repository counters when ``git_token`` is set, skip it when
    ``_filter_repo`` rejects it, and otherwise create a page for it.
    """
    for trend in trends:
        time.sleep(0.3)  # avoid rate limit for notion API
        if query_page(client, database_id, trend.title):
            continue
        # Log the identifying fields rather than the object itself, so the
        # message shows which repository is being processed.
        logging.info("new trend %s (%s)", trend.title, trend.url)
        if git_token:
            trend.fullfill_repo_info(git_token)
        # NOTE(review): without a token the counters stay at 0, so any
        # positive Min* threshold filters out every repository - confirm
        # this is intended.
        if _filter_repo(trend):
            logging.info("ignore %s", trend.title)
            continue
        insert_page(client, database_id, language, trend)
def sync_trending(notion_token, database_id, git_token=None):
    """Sync GitHub trending repositories into a Notion database.

    The comma-separated "Languages" option of the "trending.language" config
    section selects which trending pages are scraped. Blank entries are
    ignored, and a language that yields no trends is logged and skipped.
    """
    client = Client(auth=notion_token, log_level=logging.ERROR)
    raw_languages = CONFIG.get("trending.language", "Languages")
    names = [part.strip() for part in raw_languages.split(",")]
    for language in names:
        if not language:
            continue
        logging.info("sync %s", language)
        trends = _scrape(language)
        if trends:
            _sync(client, database_id, language, trends, git_token)
        else:
            logging.error("language [%s] error", language)