-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
timeface
authored and
timeface
committed
Nov 3, 2018
1 parent
40be29e
commit dc00507
Showing
8 changed files
with
5,196 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from urllib.request import urlopen | ||
from html.parser import HTMLParser | ||
|
||
def isjob(url):
    """Return True if *url* is a job-detail path of the form ``/jobs/<digits>/``.

    The path must split on ``/`` into exactly four parts: an empty leading
    part, the literal ``jobs``, an all-digit job id, and an empty trailing
    part (i.e. it starts and ends with a slash).

    Anything that is not a string is rejected instead of raising, because
    HTMLParser reports a valueless attribute (``<a href>``) as ``None``.
    """
    if not isinstance(url, str):
        return False
    parts = url.split('/')
    if len(parts) != 4:
        return False
    leading, section, job_id, trailing = parts
    return (
        leading == ''
        and trailing == ''
        and section == 'jobs'
        and job_id.isdigit()
    )
|
||
class Scraper(HTMLParser):
    """HTML parser that prints the text and URL of every job link it sees.

    A job link is an ``<a>`` element whose ``href`` satisfies ``isjob``.
    While inside such a link the text fragments are collected in
    ``chunks``; when the link closes, ``"<text> (<url>)"`` is printed.
    """

    # Class-level default kept so the flag is readable even before __init__.
    in_link = False

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialise all per-link state up front so no attribute is missing
        # before the first job link has been encountered.
        self.in_link = False
        self.url = ''
        self.chunks = []

    def handle_starttag(self, tag, attrs):
        """Start collecting text when an ``<a>`` pointing at a job opens."""
        if tag != 'a':
            return
        # A valueless attribute (``<a href>``) is reported as None; treat it
        # like a missing href instead of passing None on to isjob().
        url = dict(attrs).get('href') or ''
        if isjob(url):
            self.url = url
            self.in_link = True
            self.chunks = []

    def handle_data(self, data):
        """Collect text fragments while inside a job link."""
        if self.in_link:
            self.chunks.append(data)

    def handle_endtag(self, tag):
        """Print the collected link text and URL when the job link closes."""
        if tag == 'a' and self.in_link:
            print('{} ({})'.format(''.join(self.chunks), self.url))
            self.in_link = False
|
||
|
||
# Download the python.org job board and print every job link found in it.
# The response is used as a context manager so the connection is closed
# as soon as the body has been read.
with urlopen('http://python.org/jobs') as response:
    text = response.read().decode()
parser = Scraper()
parser.feed(text)
parser.close()
|
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from urllib import request | ||
import os | ||
|
||
# Web URLs available for crawling; current_crawl_url selects the active one.
crawl_url_1 = "http://www.baidu.com"
crawl_url_2 = "http://edu.51cto.com"
current_crawl_url = crawl_url_1

# Local file-system destination: <cwd>/db/<file_name>.html
local_dir = os.getcwd()
# Second dot-separated piece of the URL, e.g. "baidu" for http://www.baidu.com
file_name = current_crawl_url.split(".")[1]
# os.path.join inserts the platform separator and avoids doubled separators.
local_file_path = os.path.join(local_dir, 'db', file_name + '.html')
|
||
def urlopen(url, filename):
    """Download *url* and write the raw response body to *filename*.

    Args:
        url: Address to fetch; passed straight to ``urllib.request.urlopen``.
        filename: Path of the local file to create or overwrite.

    The body is written in binary mode because ``read()`` returns bytes,
    not str.
    """
    # For HTTP URLs the response is an http.client.HTTPResponse; it also
    # exposes info(), getcode() (HTTP status) and geturl() if metadata is
    # needed. Using it as a context manager closes the connection promptly.
    with request.urlopen(url) as response:
        data = response.read()
    with open(filename, 'wb') as output_file:
        output_file.write(data)
|
||
def urlretrieve(url, filename):
    """Download *url* directly into the local file *filename*.

    Args:
        url: Address to fetch.
        filename: Path of the local file to create or overwrite.
    """
    try:
        request.urlretrieve(url, filename=filename)
    finally:
        # Clear anything urlretrieve left behind, even when the download
        # fails part-way through.
        request.urlcleanup()
|
||
|
||
if __name__ == "__main__":
    # Download the page and store it as a local file.
    urlretrieve(current_crawl_url, local_file_path)
|
||
|
||
|
||
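'''
Is there a Python module like Node.js's path module for resolving and manipulating OS file paths - path.resolve(), path.join(), glob() and so on?
(The standard library covers this with os.path, pathlib and glob.)
'''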
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from urllib import request, parse | ||
|
||
import os | ||
|
||
# Web URL to crawl.
crawl_url_1 = "http://blog.csdn.net/weiwei_pig/article/details/51178226"

current_crawl_url = crawl_url_1

# Local file-system destinations, both under <cwd>/db/.
# os.path.join inserts the platform separator and avoids doubled separators.
local_dir = os.getcwd()
local_file_path_headers = os.path.join(local_dir, 'db', "csdn_headers" + '.html')
local_file_path = os.path.join(local_dir, 'db', "csdn" + '.html')

user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
# NOTE: when this site is crawled, the data returned is the home page,
# whereas opening it in a browser shows the normal content page.
|
||
|
||
def urlopen(url, filename, timeout=40):
    """Download *url* and write the raw response body to *filename*.

    Args:
        url: Address to fetch; passed straight to ``urllib.request.urlopen``.
        filename: Path of the local file to create or overwrite.
        timeout: Seconds to wait on blocking network operations
            (defaults to the previously hard-coded 40).

    The body is written in binary mode because ``read()`` returns bytes,
    not str.
    """
    # Using the response as a context manager closes the connection promptly.
    with request.urlopen(url, timeout=timeout) as response:
        data = response.read()
    with open(filename, 'wb') as output_file:
        output_file.write(data)
|
||
|
||
def urlopen_headers(url, filename):
    """Download *url* with a custom User-Agent header and save it to *filename*.

    An opener is built whose default headers are replaced by a single
    ``User-Agent`` entry taken from the module-level ``user_agent``.

    Args:
        url: Address to fetch.
        filename: Path of the local file to create or overwrite.
    """
    headers_u = ("User-Agent", user_agent)
    opener = request.build_opener()
    opener.addheaders = [headers_u]
    print(opener.addheaders, 'opener.addheaders')
    # Using the response as a context manager closes the connection promptly.
    with opener.open(url) as response:
        data = response.read()  # bytes, not str
    with open(filename, 'wb') as output_file:
        output_file.write(data)
|
||
def urlopen_request(url):
    """GET *url* with a ``wd`` query parameter and return the response body.

    The search keyword is percent-encoded before being appended, and the
    request carries the module-level ``user_agent`` as its User-Agent.

    Args:
        url: Base address; ``?wd=<keyword>`` is appended to it.

    Returns:
        The response body as bytes.
    """
    # Escape characters that are not allowed verbatim in a URL.
    key = parse.quote('王镇')
    req = request.Request(url + "?wd=" + key)
    req.add_header("User-Agent", user_agent)
    with request.urlopen(req) as response:
        # read must be *called*; the original only stored the bound method.
        return response.read()
|
||
def post_open(url):
    """POST a fixed name/pass form to *url* and return the response body.

    The form fields are URL-encoded and sent as UTF-8 bytes; supplying a
    body makes urllib issue a POST rather than a GET.

    Args:
        url: Address to post the form to.

    Returns:
        The response body as bytes.
    """
    post_data = parse.urlencode({
        "name": 'wangzhen',
        "pass": '12345',
    }).encode("utf-8")
    req = request.Request(url, post_data)
    req.add_header("User-agent", user_agent)
    # Read and close the response instead of leaving the connection open.
    with request.urlopen(req) as response:
        return response.read()
|
||
if __name__ == "__main__":
    # Download the page with urllib's default headers and store it locally.
    urlopen(current_crawl_url, local_file_path)
    # Download it again with a browser User-Agent and store it separately.
    urlopen_headers(current_crawl_url, local_file_path_headers)
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# from socketserver import TCPServer, StreamRequestHandler | ||
# class Handler(StreamRequestHandler): | ||
# def handle(self): | ||
# addr = self.request.getpeername() | ||
# print('Got connection from: ', addr) | ||
# self.wfile.write('Thank you from connecting') | ||
|
||
# server = TCPServer( ("", 5000), Handler) | ||
# server.serve_forever() | ||
|
||
from socketserver import TCPServer, StreamRequestHandler | ||
class Handler(StreamRequestHandler):
    """Request handler that logs the peer address and sends a thank-you."""

    def handle(self):
        """Print the client's address and write a fixed greeting to it."""
        addr = self.request.getpeername()
        print('Got connection from', addr)
        # wfile is a binary stream in Python 3, so the payload must be
        # bytes; writing a str raises TypeError.
        self.wfile.write(b'Thank you for connecting')
# Listen on every interface, port 1234, handling one connection at a time.
server = TCPServer(('', 1234), Handler)
try:
    server.serve_forever()
finally:
    # Release the listening socket when the loop stops (e.g. on Ctrl-C).
    server.server_close()
|
||
''' | ||
fork | ||
thread | ||
asynchronous I/O | ||
''' |