[TOC]

urllib

1. request

request存在于urllib库中,导入方法:
from urllib import request

1.1 urlopen

urlopen的方法以及参数如下:

1
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)

用于打开一个url

  • url: 打开的url
  • data: POST提交的数据(请求体,需为bytes类型);为None时发送GET请求
  • timeout: 超时时间

用法如下:

1
2
3
4
5
6
7
8
9
10
11
12
def demo():
    """Open ``_url`` with a 3-second timeout and print its status and headers.

    If the server answers with an HTTP error, the error is printed and the
    function returns early: in that case ``s`` was never bound, so there is
    nothing further to inspect.
    """
    try:
        s = request.urlopen(_url, timeout=3)
    except request.HTTPError as e:
        print(e)
        return
    # The response is a context manager; leaving the block closes the connection.
    with s:
        msg = s.info()
        # Body decoded to show how read() is used; it is not printed here.
        data = s.read().decode('utf-8')
        # print_list(dir(s))  # print all functions and properties
        print(s.status)
        print(msg)

urlopen返回的响应对象(response)的方法:

  • info(): 返回一个HTTPMessage对象

返回response headers信息

  • read(size)
  • readline()
  • readlines()
  • close()
  • getcode(): 获得应答码

1.2 urlretrieve

用于下载一个远程文件

1
def urlretrieve(url, filename=None, reporthook=None, data=None)
  • url: 地址
  • filename: 保存文件名
  • reporthook: 下载状态报告
  • data: POST提交的数据(请求体,需为bytes类型)
  • 返回(filename, HTTPMessage)

reporthook
def progress(blk, blk_size, total_size)

  • blk: 传输的块数
  • blk_size: 块大小
  • total_size: 数据总大小

测试代码

1
2
3
4
5
6
7
8
9
10
def progress(blk, blk_size, total_size):
    """Print download progress; usable as a ``urlretrieve`` reporthook.

    Args:
        blk: number of blocks transferred so far.
        blk_size: size of one block in bytes.
        total_size: total size in bytes; values <= 0 are treated as unknown.
    """
    downloaded = blk * blk_size
    if total_size <= 0:
        # No usable total: a percentage would divide by zero or come out negative.
        print('%d/unknown' % downloaded)
        return
    # The last block is usually partial, so blk * blk_size can overshoot the total.
    downloaded = min(downloaded, total_size)
    print('%d/%d - %.02f%%' % (downloaded, total_size, downloaded * 100 / total_size))

def retrieve():
    """Download ``_url`` into ``index.html``, reporting progress via ``progress``."""
    target = 'index.html'
    # urlretrieve saves the URL's content to the given local file name.
    request.urlretrieve(_url, target, reporthook=progress)

1.3 Request

1
2
3
4
5
class Request:

def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False,
method=None):

可以使用Request添加或修改http头

1
2
3
4
5
6
def build_opener_demo():
    """Build a POST ``Request`` carrying form data and a custom User-Agent."""
    # POST: placeholder credentials -- never hard-code real account data.
    data = {'username': 'your_username', 'password': 'your_password'}
    header = {'User-Agent': 'Mozilla/5.0'}
    # The request body must be bytes, hence urlencode(...) then encode('utf-8').
    body = parse.urlencode(data).encode('utf-8')
    req = request.Request('http://www.douban.com', data=body, headers=header)
    # Further headers can be attached afterwards with req.add_header(key, value).

1.4 build_opener

1
2
3
4
5
6
7
8
9
"""
Create an opener object from a list of handlers.

The opener will use several default handlers, including support
for HTTP, FTP and when applicable HTTPS.

If any of the handlers passed as arguments are subclasses of the
default handlers, the default handlers will not be used."""
def build_opener(*handlers)
  • 参数为Handler列表
    • BaseHandler为父类
    • HTTPHandler
    • HTTPSHandler
    • HTTPCookieProcessor
  • 返回OpenerDirector
1
2
3
4
5
6
7
8
9
10
11
def build_opener_demo():
    """POST form data through an opener that logs the HTTP exchange.

    Prints up to the first 500 bytes of the response body, decoded as UTF-8.
    """
    # POST: placeholder credentials -- never hard-code real account data.
    data = {'username': 'your_username', 'password': 'your_password'}
    header = {'User-Agent': 'Mozilla/5.0'}
    # The request body must be bytes, hence urlencode(...) then encode('utf-8').
    body = parse.urlencode(data).encode('utf-8')
    req = request.Request('http://www.douban.com', data=body, headers=header)
    # debuglevel=1 makes the handler output debug info for the exchange.
    opener = request.build_opener(request.HTTPHandler(debuglevel=1))
    # The context manager closes the response even if reading or decoding fails.
    with opener.open(req) as s:
        # A 500-byte cut can split a multi-byte UTF-8 character; ignore the
        # truncated tail instead of raising UnicodeDecodeError.
        print(s.read(500).decode('utf-8', errors='ignore'))

output

1
2
3
4
5
6
7
8
9
10
<!DOCTYPE HTML>
<html lang="zh-cms-Hans" class="">
<head>
<meta charset="UTF-8">
<meta name="description" content="提供图书、电影、音乐唱片的推荐、评论和价格比较,以及城市独特的文化生活。">
<meta name="keywords" content="豆瓣,广播,登陆豆瓣">
<meta property="qc:admins" content="27514711322166375" />
<meta property="wb:webmaster" content="375d4a17a4fa24c2" />
<meta name="mobile-agent" content="format=xhtml; url=http://m.douban.com">
<title>豆瓣</title>

2. parse

2.1 urlencode

  • 把字典数据转换为url编码
  • 用途
  • 对url参数进行编码
  • 对post上去的form数据进行编码
1
2
3
4
5
def urlencode():
    """Round-trip a sample dict through URL encoding and print both forms."""
    fields = {'score': 100, 'name': 'basic spider', 'comment': 'very good'}
    encoded = parse.urlencode(fields)  # dict -> query string
    decoded = parse.parse_qs(encoded)  # query string -> dict of value lists
    print(encoded)
    print(decoded)

output:

1
2
 name=basic+spider&score=100&comment=very+good
{'name': ['basic spider'], 'score': ['100'], 'comment': ['very good']}

2.2 parse_qs

  • 把url编码转换为字典数据
1
2
3
4
def parse_test():
    """Parse the query string of a sample Baidu search URL and print the result."""
    url = 'https://www.baidu.com/s?wd=python%E5%AD%A6%E4%B9%A0&rsv_spt=1&rsv_iqid=0x9949316d002fc16e&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=python&inputT=1288&rsv_t=332846lzDwGW5S%2FPcPxNwi%2BGLXjhhSsLTTEoq7CDUafIYdFz%2FW7jKJOOhPpuzg1oFlve&rsv_sug3=14&rsv_sug1=14&rsv_sug7=100&rsv_pq=f4c9c20c002dabfe&bs=python'
    # parse_qs expects only the query component; given the whole URL it would
    # produce the key 'https://www.baidu.com/s?wd' instead of 'wd'.
    qs = parse.urlsplit(url).query
    result = parse.parse_qs(qs)
    print_dict(result)

output

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
('https://www.baidu.com/s?wd', ['python学习'])
('rsv_sug7', ['100'])
('rsv_iqid', ['0x9949316d002fc16e'])
('oq', ['python'])
('rsv_bp', ['1'])
('issp', ['1'])
('rsv_t', ['332846lzDwGW5S/PcPxNwi+GLXjhhSsLTTEoq7CDUafIYdFz/W7jKJOOhPpuzg1oFlve'])
('rsv_idx', ['2'])
('rsv_sug1', ['14'])
('rsv_spt', ['1'])
('rsv_enter', ['1'])
('ie', ['utf-8'])
('rsv_sug3', ['14'])
('inputT', ['1288'])
('tn', ['baiduhome_pg'])
('bs', ['python'])
('rsv_pq', ['f4c9c20c002dabfe'])
('f', ['8'])

需要使用的类:

  • http.cookiejar
    • from http import cookiejar
  • request.HTTPCookieProcessor

CookieJar:

1
2
3
4
5
6
7
class CookieJar:
"""Collection of HTTP cookies.

You may not need to know about this class: try
urllib.request.build_opener(HTTPCookieProcessor).open(url).
"""
def __init__(self, policy=None):

提供一个解析并且保存cookie的接口


HTTPCookieProcessor:

1
2
class HTTPCookieProcessor(BaseHandler):
def __init__(self, cookiejar=None):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def cookie_demo():
    """Fetch a page twice through a cookie-aware opener and print what happens.

    The first response's headers are printed, then the jar's contents, then
    the headers of a second request made through the same opener (and so the
    same cookie jar).
    """
    cookiejar_ = cookiejar.CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar_)
    opener = request.build_opener(handler, request.HTTPHandler(debuglevel=1))
    # Context managers close each response even if printing raises.
    with opener.open('http://www.douban.com') as s:
        # print(s.read(500).decode('utf-8'))
        print_list(s.getheaders())

    print('=' * 70)
    # NOTE: _cookies is a private attribute, used here only to show the jar's
    # internal mapping; in real code iterate the jar (for c in cookiejar_).
    print('cookiejar = ', cookiejar_._cookies)
    print('=' * 70)
    with opener.open('http://www.douban.com') as s:
        print(s.info())  # == print_dict(s.getheaders())

output

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
('Server', 'dae')
('Date', 'Sun, 28 Feb 2016 17:55:14 GMT')
('Content-Type', 'text/html; charset=utf-8')
('Content-Length', '94309')
('Connection', 'close')
('X-Douban-Mobileapp', '0')
('Expires', 'Sun, 1 Jan 2006 01:00:00 GMT')
('X-Douban-Newbid', 'Iit6RGmQAE4')
('Pragma', 'no-cache')
('Cache-Control', 'must-revalidate, no-cache, private')
('P3P', 'CP="IDC DSP COR ADM DEVi TAIi PSA PSD IVAi IVDi CONi HIS OUR IND CNT"')
('Set-Cookie', 'bid="Iit6RGmQAE4"; path=/; domain=.douban.com; expires=Mon, 27-Feb-2017 17:55:14 GMT')
('Set-Cookie', 'll="118172"; path=/; domain=.douban.com; expires=Mon, 27-Feb-2017 17:55:14 GMT')
('X-DAE-Node', 'sindar21c')
('X-DAE-App', 'sns')
('Strict-Transport-Security', 'max-age=0;')
======================================================================
cookiejar = {'.douban.com': {'/': {'ll': Cookie(version=0, name='ll', value='"118172"', port=None, port_specified=False, domain='.douban.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1488218114, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), 'bid': Cookie(version=0, name='bid', value='"Iit6RGmQAE4"', port=None, port_specified=False, domain='.douban.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1488218114, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False)}}}
======================================================================
Server: dae
Date: Sun, 28 Feb 2016 17:55:14 GMT
Content-Type: text/html; charset=utf-8
Content-Length: 94309
Connection: close
X-Douban-Mobileapp: 0
Expires: Sun, 1 Jan 2006 01:00:00 GMT
Pragma: no-cache
Cache-Control: must-revalidate, no-cache, private
X-DAE-Node: sindar17b
X-DAE-App: sns
Strict-Transport-Security: max-age=0;