keyword

贴吧 西部世界 findall csv

Checkout

  1. 抓取西部世界贴吧一帖的每楼层的用户名、内容、发表时间
  2. 将所选网页另存为本地文件(保存类型选择“网页,仅 HTML”),文件名为 index.html
  3. 分析网页源码,写出能匹配目标内容的正则表达式
  4. 打开文件,匹配内容
  5. 保存到 csv 文件中
import re
import csv

def parse_floors(source):
    """Extract the username, content and post time of every floor in a page.

    Args:
        source: HTML text of a saved Tieba thread page.

    Returns:
        A list with one dict per floor, in page order, each holding the keys
        'username', 'content' and 'deploy_time'.

    Raises:
        IndexError: if a matched floor lacks any of the three fields (for
            example, its post time does not start with "2017-").
    """
    # Coarse pass first: cut the page into one chunk of HTML per floor.
    all_floors = re.findall('j_l_post clearfix  "(.*?)p_props_tail', source, re.S)

    # Fine pass: pull the individual fields out of each floor chunk.
    result_list = []
    for each in all_floors:
        result_dict = {
            'username': re.findall('username="(.*?)"', each, re.S)[0],
            # Strip the indentation run that the page puts before the text.
            'content': re.findall('j_d_post_content ">(.*?)<', each, re.S)[0].replace('         ', ''),
            # Several "tail-info" spans exist per floor; the "2017-" prefix
            # selects the one that carries the timestamp.
            'deploy_time': re.findall('tail-info">(2017-.*?)<', each)[0],
        }
        result_list.append(result_dict)
    return result_list


def main():
    """Parse index.html, print every floor and save the rows to result.csv."""
    # NOTE(review): on Python 3 consider passing an explicit encoding here and
    # newline='' when opening the CSV file below, as the csv docs recommend.
    with open('index.html') as f:
        source = f.read()

    result_list = parse_floors(source)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for each in result_list:
        print(each['username'])
        print(each['content'])
        print(each['deploy_time'])

    with open('result.csv', 'w') as f:
        writer = csv.DictWriter(f, fieldnames=['username', 'content', 'deploy_time'])
        writer.writeheader()
        writer.writerows(result_list)


if __name__ == '__main__':
    main()

results matching ""

    No results matching ""