keywords

爬虫 抓包 http cookies urllib

Checkout

  1. 爬虫是抓取网络上数据的程序,用于内容网站、信息系统、搜索引擎
  2. 请求与响应通过 http 协议传输数据包来交换信息
  3. cookies 用来存储状态,浏览器端和服务器端都有保存,有时间限制
  4. urllib 用来发送请求,并获取响应的状态码、头信息和数据等
import urllib

# Usage: result = urllib.urlopen(str_url)
str_url = 'https://littlecarson.github.io'

result = urllib.urlopen(str_url)

# 0. Read the HTTP status code.
print(result.getcode())

# 1. Read the whole body at once.
print(result.read())

# read() above consumed the response stream, so request the page again;
# otherwise the readline()/readlines() demos below would only see empty data.
result.close()
result = urllib.urlopen(str_url)

# 2. Read sequentially, one line per call.
for i in range(4):
    print('line %d: %s' % (i + 1, result.readline()))
print(result.readline())

# 3. Read the remaining lines in one call; returns a list of lines.
line_list = result.readlines()
for line in line_list:
    print(line)

# 4. Get the response header information via info().

msg = result.info()
# The loops are written inline because print_list is only defined further
# down the file and is not bound yet when these statements run.
for name in dir(msg):
    print(name)
for header_line in msg.headers:
    print(header_line)
for item in msg.items():
    print(item)
print(msg.getheader('Content-Type'))

# 5. urllib.urlretrieve(str_url, str_file_url, fun_reporthook)
def progress(blk, blk_size, total_size):
    """Print one progress line; intended as a ``reporthook`` for urlretrieve.

    The line has the form ``<bytes so far>-<total bytes> -- <percent>%``.

    Args:
        blk: Number of blocks transferred so far.
        blk_size: Size of a single block.
        total_size: Total size of the download. When it is not positive
            (size unknown), the percentage is reported as 0.
    """
    transferred = blk * blk_size
    if total_size > 0:
        # 100.0 forces float division (Python 2 would truncate to an int) and
        # scales the ratio to a real percentage. The final block usually
        # overshoots the total, so clamp the result at 100.
        percent = min(100.0, 100.0 * transferred / total_size)
    else:
        # Unknown size: avoid dividing by zero or printing a negative value.
        percent = 0.0
    print('%d-%d -- %.02f%%' % (transferred, total_size, percent))
# Download the page to index.html, passing progress as the reporthook.
filename, msg = urllib.urlretrieve(
    'https://littlecarson.github.io', 'index.html', reporthook=progress)
print(filename)
# print_list is only defined further down the file, so it is not bound yet
# when this statement runs; iterate over the header pairs directly instead.
for item in msg.items():
    print(item)

def print_list(list):
    """Print every element of *list* on its own line.

    Args:
        list: Any iterable. The name shadows the builtin ``list``; it is kept
            unchanged so that existing keyword callers keep working.
    """
    # A single-argument print(...) behaves the same as the Python 2 print
    # statement and as the Python 3 print function.
    for item in list:
        print(item)

results matching ""

    No results matching ""