Commit

Learning urllib
timeface authored and timeface committed Nov 3, 2018
1 parent 40be29e commit dc00507
Showing 8 changed files with 5,196 additions and 0 deletions.
35 changes: 35 additions & 0 deletions Splider/htmlParser/index.py
@@ -0,0 +1,35 @@
from urllib.request import urlopen
from html.parser import HTMLParser

def isjob(url):
    # Matches only site-relative links of the form /jobs/<digits>/
    try:
        a, b, c, d = url.split('/')
    except ValueError:
        return False
    return a == d == '' and b == 'jobs' and c.isdigit()

class Scraper(HTMLParser):
    in_link = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        url = attrs.get('href', '')
        if tag == 'a' and isjob(url):
            # Entering a job link: remember the URL and start collecting its text
            self.url = url
            self.in_link = True
            self.chunks = []

    def handle_data(self, data):
        if self.in_link:
            self.chunks.append(data)

    def handle_endtag(self, tag):
        if tag == 'a' and self.in_link:
            print('{} ({})'.format(''.join(self.chunks), self.url))
            self.in_link = False


text = urlopen('http://python.org/jobs').read().decode()
parser = Scraper()
parser.feed(text)
parser.close()
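
For reference, isjob only accepts site-relative links of the form /jobs/<digits>/; a quick sanity check against the function above:

print(isjob('/jobs/12345/'))               # True:  splits into ['', 'jobs', '12345', '']
print(isjob('/jobs/about/'))               # False: 'about' is not all digits
print(isjob('http://python.org/jobs/1/'))  # False: six parts, so the unpacking raises ValueError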

Binary file added Splider/urllib/.DS_Store
Binary file not shown.
2,533 changes: 2,533 additions & 0 deletions Splider/urllib/db/csdn.html

Large diffs are not rendered by default.

2,507 changes: 2,507 additions & 0 deletions Splider/urllib/db/csdn_headers.html

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions Splider/urllib/first.py
@@ -0,0 +1,35 @@
from urllib import request
import os

# web URL to crawl
crawl_url_1 = "http://www.baidu.com"
crawl_url_2 = "http://edu.51cto.com"
current_crawl_url = crawl_url_1

# local file-system path
local_dir = os.getcwd()
file_name = current_crawl_url.split(".")[1]  # e.g. "baidu" out of "http://www.baidu.com"
local_file_path = local_dir + os.sep + 'db' + os.sep + file_name + '.html'

def urlopen(url, filename):
    file = request.urlopen(url)  # >>> type(file) <class 'http.client.HTTPResponse'>
    file.info()     # response headers (return value unused here)
    file.getcode()  # HTTP status code
    file.geturl()   # the URL actually fetched, after any redirects
    data = file.read()  # data is of type bytes, not str!
    with open(filename, 'wb') as output_file:
        output_file.write(data)

def urlretrieve(url, filename):
    request.urlretrieve(url, filename=filename)
    request.urlcleanup()  # clear the cache left behind by urlretrieve


if "__main__" == __name__:
    urlretrieve(current_crawl_url, local_file_path)  # download and save as a local file


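As an aside, urlretrieve also takes an optional reporthook callback, which makes it easy to print download progress. A small sketch (the progress function is not in the original code):

def progress(block_num, block_size, total_size):
    # called by urlretrieve after each block is fetched
    print('downloaded {} of {} bytes'.format(block_num * block_size, total_size))

# request.urlretrieve(current_crawl_url, local_file_path, reporthook=progress)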

'''
Is there a Python module that handles file-system paths the way the Node.js
path module does -- something like path.resolve(), path.join(), glob()?
'''
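
There is: os.path and pathlib in the standard library play the role of Node's path module, and glob is its own module. A minimal sketch:

import os.path
from glob import glob
from pathlib import Path

p = os.path.join(os.getcwd(), 'db', 'baidu.html')  # like path.join()
print(os.path.abspath(p))                          # like path.resolve()
print(Path('db') / 'baidu.html')                   # pathlib, the object-oriented variant
print(glob('db/*.html'))                           # same idea as glob in Node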
62 changes: 62 additions & 0 deletions Splider/urllib/headers.py
@@ -0,0 +1,62 @@
from urllib import request, parse

import os

# web URL to crawl
crawl_url_1 = "http://blog.csdn.net/weiwei_pig/article/details/51178226"

current_crawl_url = crawl_url_1


# local file-system paths
local_dir = os.getcwd()
local_file_path_headers = local_dir + os.sep + 'db' + os.sep + "csdn_headers" + '.html'
local_file_path = local_dir + os.sep + 'db' + os.sep + "csdn" + '.html'


user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
# When crawled without browser-like headers, this site returns its homepage data;
# opened in a real browser, it shows the normal article page.


def urlopen(url, filename):
    file = request.urlopen(url, timeout=40)  # >>> type(file) <class 'http.client.HTTPResponse'>
    data = file.read()  # data is of type bytes, not str!
    with open(filename, 'wb') as output_file:
        output_file.write(data)


def urlopen_headers(url, filename):
    headers_u = ("User-Agent", user_agent)
    opener = request.build_opener()
    opener.addheaders = [headers_u]
    print(opener.addheaders, 'opener.addheaders')
    file = opener.open(url)  # >>> type(file) <class 'http.client.HTTPResponse'>
    data = file.read()  # data is of type bytes, not str!
    with open(filename, 'wb') as output_file:
        output_file.write(data)

def urlopen_request(url):
    key = '王镇'
    key = request.quote(key)  # percent-encode special characters for use in a URL
    req = request.Request(url + "?wd=" + key)
    req.add_header("User-Agent", user_agent)
    file = request.urlopen(req)
    data = file.read()  # note the parentheses: file.read alone is just the method object

def post_open(url):
    post_data = parse.urlencode({
        "name": 'wangzhen',
        "pass": '12345'
    }).encode("utf-8")
    req = request.Request(url, post_data)  # passing data makes this a POST request
    req.add_header("User-Agent", user_agent)
    file = request.urlopen(req)

if "__main__" == __name__:

    urlopen(current_crawl_url, local_file_path)  # download and save as a local file
    urlopen_headers(current_crawl_url, local_file_path_headers)  # same, with a User-Agent header

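For comparison, Request can also take the same headers as a dict, which avoids the opener plumbing. A minimal sketch reusing the user_agent string above (urlopen_headers_dict is not in the original code):

def urlopen_headers_dict(url, filename):
    req = request.Request(url, headers={"User-Agent": user_agent})
    data = request.urlopen(req).read()  # bytes, as before
    with open(filename, 'wb') as output_file:
        output_file.write(data)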


Binary file added modules/shelve/shelve.txt
Binary file not shown.
24 changes: 24 additions & 0 deletions socket/socketServer.py
@@ -0,0 +1,24 @@
from socketserver import TCPServer, StreamRequestHandler

class Handler(StreamRequestHandler):
    def handle(self):
        addr = self.request.getpeername()
        print('Got connection from', addr)
        self.wfile.write(b'Thank you for connecting')  # wfile expects bytes, not str

server = TCPServer(('', 1234), Handler)
server.serve_forever()

'''
Ways to handle many clients at once:
fork
thread
asynchronous I/O
'''
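
Of the three, threading is the quickest to try with socketserver: swapping TCPServer for ThreadingTCPServer gives every connection its own thread. A minimal sketch along the lines of the handler above:

from socketserver import ThreadingTCPServer, StreamRequestHandler

class Handler(StreamRequestHandler):
    def handle(self):
        # runs in its own thread, so one slow client no longer blocks the rest
        self.wfile.write(b'Thank you for connecting')

server = ThreadingTCPServer(('', 1234), Handler)
server.serve_forever()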
