"""
keyword
贴吧 西部世界 findall csv
Checkout
- 抓取西部世界贴吧一帖的每楼层的用户名、内容、发表时间
- 保存所选网页(仅 HTML)
- 寻找所要匹配内容的正则
- 打开文件,匹配内容
- 保存到 csv 文件中
"""
import re
import csv
# Patterns for a saved Tieba thread page, compiled once at import time.
# Coarse pass: _FLOOR_RE captures the raw HTML of each floor (post).
_FLOOR_RE = re.compile('j_l_post clearfix "(.*?)p_props_tail', re.S)
# Fine pass: each of these picks one field out of a single floor's HTML.
_USERNAME_RE = re.compile('username="(.*?)"', re.S)
_CONTENT_RE = re.compile('j_d_post_content ">(.*?)<', re.S)
# NOTE: only timestamps that start with "2017-" are recognised; a floor whose
# time does not match ends up with an empty deploy_time.
_DEPLOY_TIME_RE = re.compile('tail-info">(2017-.*?)<')

_FIELDNAMES = ['username', 'content', 'deploy_time']


def _first_match(pattern, text):
    """Return the first captured group of *pattern* in *text*, or '' if none."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def _parse_floors(source):
    """Return one dict (username, content, deploy_time) per floor in *source*."""
    return [
        {
            'username': _first_match(_USERNAME_RE, floor),
            'content': _first_match(_CONTENT_RE, floor).replace(' ', ''),
            'deploy_time': _first_match(_DEPLOY_TIME_RE, floor),
        }
        for floor in _FLOOR_RE.findall(source)
    ]


def main(source_path='index.html', output_path='result.csv', encoding='utf-8'):
    """Extract every floor of a saved thread page and export it as CSV.

    Reads the HTML file at *source_path*, pulls the user name, text content
    and post time out of each floor, prints the three values of every floor,
    and writes all floors to *output_path* with a header row.

    Args:
        source_path: Path of the saved HTML page.
        output_path: Path of the CSV file to (over)write.
        encoding: Text encoding used for both the input and the output file.

    Returns:
        The list of per-floor dicts that was written to the CSV file.  A
        field that cannot be found in a floor is stored as an empty string.
    """
    # An explicit encoding keeps non-ASCII text independent of the locale.
    with open(source_path, encoding=encoding) as f:
        source = f.read()

    result_list = _parse_floors(source)

    for each in result_list:
        print(each['username'])
        print(each['content'])
        print(each['deploy_time'])

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(output_path, 'w', newline='', encoding=encoding) as f:
        writer = csv.DictWriter(f, fieldnames=_FIELDNAMES)
        writer.writeheader()
        writer.writerows(result_list)

    return result_list


if __name__ == '__main__':
    main()