RE:
import re
import time

# Saved copy of the page that this script reads.
HTML_PATH = "豆瓣电影Top250.html"

# One match per movie entry: group 1 is the text of the first "title" span,
# group 2 is whatever precedes "人评价" in the following bare <span>.
MOVIE_PATTERN = '<li>.*?<div class="item">.*?<div class="hd">.*?<span class="title">(.+?)</span>.*? <span>(.+?)人评价</span>'


def extract_movies(html):
    """Return a list of ``(title, comment_count)`` string tuples found in *html*.

    The pattern is applied with ``re.S`` so that ``.`` also matches newlines,
    which lets a single match span the several lines of one ``<li>`` entry.
    An input without any matching entry yields an empty list.
    """
    return re.findall(MOVIE_PATTERN, html, re.S)


def main():
    """Read the saved page, print the extracted movies and the elapsed time."""
    with open(HTML_PATH, encoding="utf-8") as f:
        html = f.read()

    # perf_counter() is meant for measuring short durations; time.time() can
    # be too coarse for an interval of about a millisecond.
    # NOTE: the timed region covers the regex search and the print call, but
    # not reading the file.
    start_time = time.perf_counter()
    movies = extract_movies(html)
    print(movies)
    elapsed_time = time.perf_counter() - start_time
    print("代码执行时间:", elapsed_time, "秒")


if __name__ == "__main__":
    main()
--------------------------------------------------------运行结果----------------------------------------------------------------------------------------
C:\Users\user\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:\Users\user\PycharmProjects\pythonProject1\regularex\提取.py
[('肖申克的救赎', '2912555'), ('霸王别姬', '2150218'), ('阿甘正传', '2171334'), ('泰坦尼克号', '2202721'), ('这个杀手不太冷', '2305057'), ('千与千寻', '2255218'), ('美丽人生', '1332295'), ('辛德勒的名单', '1110787'), ('星际穿越', '1848980'), ('盗梦空间', '2071045'), ('楚门的世界', '1720585'), ('忠犬八公的故事', '1404616'), ('海上钢琴师', '1685059'), ('三傻大闹宝莱坞', '1866833'), ('放牛班的春天', '1317617'), ('机器人总动员', '1321945'), ('疯狂动物城', '1948593'), ('无间道', '1370841'), ('控方证人', '565051'), ('大话西游之大圣娶亲', '1540664'), ('熔炉', '935056'), ('教父', '972589'), ('触不可及', '1119658'), ('当幸福来敲门', '1526688'), ('末代皇帝', '890123')]
代码执行时间: 0.000997304916381836 秒
进程已结束,退出代码为 0
**************************************************************************************************************************************************************************************
BeautifulSoup:
import re
import time

from bs4 import BeautifulSoup

# Saved copy of the page that this script reads.
HTML_PATH = "豆瓣电影Top250.html"


def extract_movies(soup):
    """Return one dict per element of class "item" found in *soup*.

    Each dict has the keys ``'title'``, ``'rating_num'`` and ``'comment_num'``,
    all holding strings. ``'comment_num'`` is the run of digits that the
    pattern captures in front of "人评价".
    """
    comment_reg = re.compile(r"(\d+)人评价")
    movie_list = []
    for item in soup.find_all(class_="item"):
        comment_text = item.find(string=comment_reg)
        movie_list.append({
            'title': item.find(class_='title').text,
            'rating_num': item.find(class_='rating_num').text,
            # Take the captured digits instead of slicing off the last three
            # characters, so surrounding whitespace cannot corrupt the value.
            'comment_num': comment_reg.search(comment_text).group(1),
        })
    return movie_list


def main():
    """Parse the saved page, print the extracted movies and the elapsed time."""
    # The context manager closes the file once the parser has consumed it.
    with open(HTML_PATH, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # NOTE: the document is parsed before the clock starts, so the timed
    # region covers only the tree queries and the print call.
    # perf_counter() is meant for measuring short durations.
    start_time = time.perf_counter()
    movie_list = extract_movies(soup)
    print(movie_list)
    elapsed_time = time.perf_counter() - start_time
    print("代码执行时间: ", elapsed_time, "秒")


if __name__ == "__main__":
    main()
------------------------------------------------------------------------运行结果--------------------------------------------------------------------------
C:\Users\user\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:\Users\user\PycharmProjects\pythonProject1\http\review\BS4_L\提取电影信息.py
[{'title': '肖申克的救赎', 'rating_num': '9.7', 'comment_num': '2912555'}, {'title': '霸王别姬', 'rating_num': '9.6', 'comment_num': '2150218'}, {'title': '阿甘正传', 'rating_num': '9.5', 'comment_num': '2171334'}, {'title': '泰坦尼克号', 'rating_num': '9.5', 'comment_num': '2202721'}, {'title': '这个杀手不太冷', 'rating_num': '9.4', 'comment_num': '2305057'}, {'title': '千与千寻', 'rating_num': '9.4', 'comment_num': '2255218'}, {'title': '美丽人生', 'rating_num': '9.6', 'comment_num': '1332295'}, {'title': '辛德勒的名单', 'rating_num': '9.6', 'comment_num': '1110787'}, {'title': '星际穿越', 'rating_num': '9.4', 'comment_num': '1848980'}, {'title': '盗梦空间', 'rating_num': '9.4', 'comment_num': '2071045'}, {'title': '楚门的世界', 'rating_num': '9.4', 'comment_num': '1720585'}, {'title': '忠犬八公的故事', 'rating_num': '9.4', 'comment_num': '1404616'}, {'title': '海上钢琴师', 'rating_num': '9.3', 'comment_num': '1685059'}, {'title': '三傻大闹宝莱坞', 'rating_num': '9.2', 'comment_num': '1866833'}, {'title': '放牛班的春天', 'rating_num': '9.3', 'comment_num': '1317617'}, {'title': '机器人总动员', 'rating_num': '9.3', 'comment_num': '1321945'}, {'title': '疯狂动物城', 'rating_num': '9.2', 'comment_num': '1948593'}, {'title': '无间道', 'rating_num': '9.3', 'comment_num': '1370841'}, {'title': '控方证人', 'rating_num': '9.6', 'comment_num': '565051'}, {'title': '大话西游之大圣娶亲', 'rating_num': '9.2', 'comment_num': '1540664'}, {'title': '熔炉', 'rating_num': '9.4', 'comment_num': '935056'}, {'title': '教父', 'rating_num': '9.3', 'comment_num': '972589'}, {'title': '触不可及', 'rating_num': '9.3', 'comment_num': '1119658'}, {'title': '当幸福来敲门', 'rating_num': '9.2', 'comment_num': '1526688'}, {'title': '末代皇帝', 'rating_num': '9.3', 'comment_num': '890123'}]
代码执行时间: 0.0039997100830078125 秒
进程已结束,退出代码为 0
*****************************************************************************************************************************************************************************************
LXML-etree:
import time

from lxml import etree

# Saved copy of the page that this script reads.
HTML_PATH = "豆瓣电影Top250.html"


def iter_movies(html):
    """Yield a ``(title, rating_num, comment_num)`` tuple per "item" div in *html*.

    Raises ``IndexError`` if an item lacks one of the three text nodes, because
    each XPath result list is indexed with ``[0]``.
    """
    selector = etree.HTML(html)
    for item in selector.xpath('//div[@class="item"]'):
        title = item.xpath(".//span[@class='title'][1]/text()")[0]
        rating_num = item.xpath(".//span[@class='rating_num']/text()")[0]
        # Drops the last three characters of the final span's text
        # (presumably the "人评价" suffix; confirm against the page).
        comment_num = item.xpath(".//div[@class='star']//span[last()]/text()")[0][:-3]
        yield title, rating_num, comment_num


def main():
    """Read the saved page, print one line per movie and the elapsed time."""
    with open(HTML_PATH, encoding='utf-8') as f:
        data = f.read()

    # NOTE: the timed region covers parsing (etree.HTML), the XPath queries
    # and the per-movie print calls, but not reading the file.
    # perf_counter() is meant for measuring short durations.
    start_time = time.perf_counter()
    for title, rating_num, comment_num in iter_movies(data):
        print(title, rating_num, comment_num)
    elapsed_time = time.perf_counter() - start_time
    print("执行时间: ", elapsed_time, "秒")


if __name__ == "__main__":
    main()
----------------------------------------------------------------------运行结果----------------------------------------------------------------------------
C:\Users\user\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:\Users\user\PycharmProjects\pythonProject1\xPath\豆瓣电影提取.py
肖申克的救赎 9.7 2912555
霸王别姬 9.6 2150218
阿甘正传 9.5 2171334
泰坦尼克号 9.5 2202721
这个杀手不太冷 9.4 2305057
千与千寻 9.4 2255218
美丽人生 9.6 1332295
辛德勒的名单 9.6 1110787
星际穿越 9.4 1848980
盗梦空间 9.4 2071045
楚门的世界 9.4 1720585
忠犬八公的故事 9.4 1404616
海上钢琴师 9.3 1685059
三傻大闹宝莱坞 9.2 1866833
放牛班的春天 9.3 1317617
机器人总动员 9.3 1321945
疯狂动物城 9.2 1948593
无间道 9.3 1370841
控方证人 9.6 565051
大话西游之大圣娶亲 9.2 1540664
熔炉 9.4 935056
教父 9.3 972589
触不可及 9.3 1119658
当幸福来敲门 9.2 1526688
末代皇帝 9.3 890123
执行时间: 0.0030667781829833984 秒
进程已结束,退出代码为 0
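总结(基于上面各一次运行的计时结果):
RE速度最快(约 0.001 秒),
lxml次之(约 0.003 秒),
soup最慢(约 0.004 秒)。
注意:三段代码的计时范围并不相同——BeautifulSoup 的计时不包含解析 HTML 的时间(解析在计时开始之前完成),lxml 的计时包含 etree.HTML 的解析时间,并且三者都把 print 输出计算在内,因此以上结果只能作为粗略参考。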

