I'm doing something...
I'm doing something...
I'm doing something...
I'm doing something...
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
/var/folders/w6/9k4dzqlj617f06dfby_vk1pr0000gn/T/ipykernel_54398/601997997.py in <module>
15
16 while True:
---> 17 schedule.run_pending() # run_pending:运行所有可以运行的任务
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/schedule/__init__.py in run_pending()
778 :data:`default scheduler instance <default_scheduler>`.
779 """
--> 780 default_scheduler.run_pending()
781
782
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/schedule/__init__.py in run_pending(self)
97 """
98 runnable_jobs = (job for job in self.jobs if job.should_run)
---> 99 for job in sorted(runnable_jobs):
100 self._run_job(job)
101
KeyboardInterrupt:
import logging
import os
import time
from datetime import datetime

import pandas as pd
import schedule
# Root logger configuration: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Module-level flag read and updated by get_content(); 0 means no scrape has
# completed yet in this session.
count = 0
def get_content():
    """Scrape the Weibo real-time hot-search table and append it to datas.csv.

    Reads the first HTML table on the hot-search page, keeps rows 1-10 with
    the '序号' (rank) and '关键词' (keyword) columns, splits the keyword cell
    on a space into keyword and '热度' (heat), stamps every row with the
    current local time in a '时间' column, and appends the rows to
    ``datas.csv`` in the working directory.

    The CSV header is written only when ``datas.csv`` does not exist yet or
    is empty, so restarting the scraper no longer inserts a second header
    row in the middle of an existing file.

    Side effects:
        Prints a progress banner, performs an HTTP request through
        ``pd.read_html`` and appends to ``datas.csv``. Sets the module-level
        ``count`` from 0 to 1 after the first successful write.

    Raises:
        Whatever ``pd.read_html`` / ``DataFrame.to_csv`` raise (e.g.
        ImportError when no HTML parser backend is installed, ValueError when
        the page contains no table); errors are not swallowed here.
    """
    global count  # module-level flag, see below
    print('----------- 正在爬取数据 -------------')
    url = 'https://s.weibo.com/top/summary?cate=realtimehot&sudaref=s.weibo.com&display=0&retcode=6102'
    csv_path = 'datas.csv'

    # Rows 1..10 are the top-10 entries; row 0 is skipped (presumably a
    # pinned entry -- confirm against the live page). The explicit copy makes
    # the column assignments below act on an independent frame instead of a
    # slice of the parsed table.
    df = pd.read_html(url)[0][1:11][['序号', '关键词']].copy()
    time_ = datetime.now().strftime("%Y/%m/%d %H:%M")  # current local time

    df['序号'] = df['序号'].astype(int)
    # Split once: piece 0 is the keyword text, piece 1 the heat value.
    pieces = df['关键词'].str.split(' ', expand=True)
    df['热度'] = pieces[1]
    df['关键词'] = pieces[0]
    df['时间'] = time_  # scalar is broadcast to every row

    # Decide on the header from the file itself rather than from in-memory
    # state, which is reset every time the script is restarted.
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    df.to_csv(csv_path, mode='a', index=False, header=write_header)

    # Kept for backward compatibility: count flips 0 -> 1 after the first
    # successful scrape of this session and then stays at 1.
    if count == 0:
        count += 1
# Scheduled scraper: run get_content once every minute.
schedule.every(1).minutes.do(get_content)

# Poll the scheduler forever. Sleeping between polls keeps the loop from
# spinning a CPU core at 100%; one second is far finer than the one-minute
# job interval, so no run is delayed noticeably.
while True:
    schedule.run_pending()  # runs every job that is currently due
    time.sleep(1)
----------- 正在爬取数据 -------------
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
/var/folders/w6/9k4dzqlj617f06dfby_vk1pr0000gn/T/ipykernel_46195/500178070.py in <module>
29
30 while True:
---> 31 schedule.run_pending()
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/schedule/__init__.py in run_pending()
778 :data:`default scheduler instance <default_scheduler>`.
779 """
--> 780 default_scheduler.run_pending()
781
782
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/schedule/__init__.py in run_pending(self)
98 runnable_jobs = (job for job in self.jobs if job.should_run)
99 for job in sorted(runnable_jobs):
--> 100 self._run_job(job)
101
102 def run_all(self, delay_seconds: int = 0) -> None:
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/schedule/__init__.py in _run_job(self, job)
170
171 def _run_job(self, job: "Job") -> None:
--> 172 ret = job.run()
173 if isinstance(ret, CancelJob) or ret is CancelJob:
174 self.cancel_job(job)
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/schedule/__init__.py in run(self)
659
660 logger.debug("Running job %s", self)
--> 661 ret = self.job_func()
662 self.last_run = datetime.datetime.now()
663 self._schedule_next_run()
/var/folders/w6/9k4dzqlj617f06dfby_vk1pr0000gn/T/ipykernel_46195/500178070.py in get_content()
12 print('----------- 正在爬取数据 -------------')
13 url = 'https://s.weibo.com/top/summary?cate=realtimehot&sudaref=s.weibo.com&display=0&retcode=6102'
---> 14 df = pd.read_html(url)[0][1:11][['序号', '关键词']] # 获取热搜前10
15 time_ = datetime.now().strftime("%Y/%m/%d %H:%M") # 获取当前时间
16 df['序号'] = df['序号'].apply(int)
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/io/html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1096 io = stringify_path(io)
1097
-> 1098 return _parse(
1099 flavor=flavor,
1100 io=io,
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/io/html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
900 retained = None
901 for flav in flavor:
--> 902 parser = _parser_dispatch(flav)
903 p = parser(io, compiled_match, attrs, encoding, displayed_only)
904
/usr/local/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/io/html.py in _parser_dispatch(flavor)
849 if flavor in ("bs4", "html5lib"):
850 if not _HAS_HTML5LIB:
--> 851 raise ImportError("html5lib not found, please install it")
852 if not _HAS_BS4:
853 raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
ImportError: html5lib not found, please install it