keywords
爬虫 抓包 http cookies urllib
Checkout
- 爬虫是抓取网络上数据的程序,用于内容网站、信息系统、搜索引擎
- 请求与响应通过 http 协议传输数据包来交换信息
- cookies 用来存储状态,浏览器端和服务器端都有保存,有时间限制
- urllib 用来发送请求,并获取响应的状态码、头信息和数据等
import urllib
# urllib.urlopen(str_url) returns a file-like response object.
page_url = 'https://littlecarson.github.io'
result = urllib.urlopen(page_url)
# 0. Read the HTTP status code
print(result.getcode())
# 1. Read the whole body at once
print(result.read())
# read() drains the response, so further reads on the same object would only
# yield empty strings; open a fresh response for the line-based demos below.
result.close()
result = urllib.urlopen(page_url)
# 2. Read one line at a time, in order
for i in range(4):
    print('line %d: %s' % (i + 1, result.readline()))
print(result.readline())
# 3. Read the remaining lines; returns a list of lines
line_list = result.readlines()
for line in line_list:
    print(line)
# 4. Header information via info()
msg = result.info()
# print_list() is only defined further down in this file, so calling it at
# this point would raise NameError; iterate over each collection directly.
for group in (dir(msg), msg.headers, msg.items()):
    for entry in group:
        print(entry)
# The header name was misspelled before, which made getheader() return None.
print(msg.getheader('Content-Type'))
result.close()
# 5. urllib.urlretrieve(str_url, str_file_url, fun_reporthook)
def progress(blk, blk_size, total_size):
    """Print download progress; suitable as a ``urllib.urlretrieve`` reporthook.

    Args:
        blk: Number of blocks transferred so far.
        blk_size: Size of one block in bytes.
        total_size: Total size of the download in bytes. Values <= 0 are
            treated as "size unknown" and no percentage is printed.
    """
    downloaded = blk * blk_size
    if total_size <= 0:
        # Dividing by an unknown (non-positive) size would either crash or
        # print a meaningless negative percentage.
        print('%d bytes -- total size unknown' % downloaded)
        return
    # The final block is usually only partly filled, so blk * blk_size can
    # overshoot the real size; clamp to keep the report at or below 100%.
    downloaded = min(downloaded, total_size)
    # 100.0 forces float division (Python 2 would otherwise truncate to 0)
    # and scales the ratio to an actual percentage.
    percent = 100.0 * downloaded / total_size
    print('%d-%d -- %.02f%%' % (downloaded, total_size, percent))
# Download the page into a local file; progress() is passed as the reporthook
# so it is called as blocks are transferred.
filename, msg = urllib.urlretrieve(
    'https://littlecarson.github.io', 'index.html', reporthook=progress)
print(filename)
# print_list() is only defined further down in this file, so calling it at
# this point would raise NameError; print the header pairs directly instead.
for header in msg.items():
    print(header)
def print_list(list):
    """Print every element of ``list`` on its own line.

    Args:
        list: Any iterable of printable items. The name shadows the builtin
            ``list``; it is kept unchanged so existing callers keep working.
    """
    for item in list:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(item)