2017.07.04
Keywords
- re.findall
- re.find
- re.search
- csv
- lxml
- xpath
- BeautifulSoup4
Checkout
re.findall, .*?
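```python
import re

re.findall('<img username=(.*?) class>', src_str, re.S)  # re.S 让 . 也能匹配换行符
re.search('<img username=(.*?) class>', src_str, re.S)  # search 匹配到第一个符合条件的就结束匹配(注意:re 模块没有 re.find)
re.search('<img username=(.*?) class>', src_str, re.S).group()  # group()/group(0) 返回整个匹配;group(1)、group(2) 返回第 1、2 个捕获组
```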
csv 文件的写入
```python
import csv

# newline='' 避免在 Windows 上写出多余的空行(csv 模块文档的要求)
with open('result.csv', 'w', encoding="UTF-8", newline='') as f:
    # 定义字典数据形式的表格字段格式写入对象:文件,字段名
    write = csv.DictWriter(f, fieldnames=['username', 'reply', 'time'])
    # 写入表头
    write.writeheader()
    # 写入行数据
    write.writerows(rows_dic)
```
lxml 使用
```python
from lxml import html
import requests

html_source = requests.get('https://www.baidu.com').text  # URL 必须带协议头,否则 requests 抛出 MissingSchema
selector_object = html.fromstring(html_source)
# 提取文本
content_list = selector_object.xpath('//ul/li[@class="active"]/text()')
content_list = selector_object.xpath('//ul/li[starts-with(@class, "active")]/text()')  # starts-with
content_list = selector_object.xpath('//ul/li[@class="active"]/text()')
content = selector_object.xpath('string(//ul/li)')  # 取第一个 li 的全部文本(含子节点);XPath 1.0 不支持 //ul/li/string(.) 这种写法
# 提取属性
title = selector_object.xpath('//a/@title')
```
BeautifulSoup4
```bash
pip install beautifulsoup4
```
```python
from bs4 import BeautifulSoup  # BeautifulSoup()
```