Python Scraping in Practice: Crawling Reviews from a Car-Buying Guide Site (Let Big Data Help You Buy the Right Car)

Preface

  Four days to pick up Python (imooc course link), three days to learn basic scraping (imooc course link), and eight days to grind out 180 lines of code, stumbling along through bug after bug after bug... Still, it knocked out my girlfriend's request fairly quickly (she needed the scraped data for a short paper). I'll paste the code up front and save the detailed walkthrough for whenever I have the time...

# Scrape the name, price, class, range, and battery capacity of every "pure
# electric" car on xcar.com.cn (爱卡汽车), plus the owner reviews, and write it
# all to an HTML file (the battery-capacity part is commented out further down).
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
from urllib import request

# URLs of the 8 listing pages for electric cars
first_urls = []
for i in range(1, 9):
    first_urls.append('http://newcar.xcar.com.cn/car/0-0-0-0-0-0-0-0-0-0-0-' + str(i) + '-1-0/')

# Collect the URL of every car model linked from those 8 listing pages
nodes_num = []
for url in first_urls:
    response = request.urlopen(url)
    html_cont = response.read()
    # the site serves GB-encoded pages; gbk (a superset of gb2312) decodes them more robustly
    soup = BeautifulSoup(html_cont.decode('gbk'), 'html.parser')
    link_nodes = soup.find_all('a', class_="car_search_ps_list_a")  # a list of matching <a> tags
    for i in link_nodes:
        nodes_num.append('http://newcar.xcar.com.cn' + str(i['href']))

# Build datas: one dict of specs and scores per model ---------------------------
datas = []
nodes_rev = []  # usable model URLs, rewritten into review ("口碑") URLs
count = 0
for url_num in nodes_num:
    count += 1

    response_num = request.urlopen(url_num)
    html_cont_num = response_num.read()
    soup_num = BeautifulSoup(html_cont_num.decode('gbk'), 'html.parser')

    url_rev = url_num + 'review.htm'
    response_rev = request.urlopen(url_rev)
    html_cont_rev = response_rev.read()
    soup_rev = BeautifulSoup(html_cont_rev.decode('gbk'), 'html.parser')

    res_data = {}  # one dict per model; when complete it holds 13 keys (name, price, ...)
    try:
        link_node01 = soup_num.find("span", class_="lt_f1").get_text()
        link_node02 = soup_num.find('div', class_="tt_h1").find("h1").get_text()
        res_data['车名'] = link_node01 + link_node02  # car name

        link_node03 = soup_num.find('a', class_="com_price_menu").get_text()
        res_data['价格'] = link_node03  # price, in units of 10,000 RMB

        link_node04 = soup_num.find_all('li', class_="w163")
        res_data['级别'] = link_node04[0].get_text()[4:-1]  # class, with the label sliced off
        res_data['续航(km)'] = re.findall(r"\d+", link_node04[1].get_text())[-1]  # range in km

        '''link_node05 = soup_num.find('a', onclick="clicklog(124783);").get_text()
        res_data['电量'] = re.findall(r"\d+", link_node05)[0]  # battery capacity in kWh'''

        # overall score; note the escaped dot -- an unescaped r'\d+.\d+' matches any character
        link_node06 = soup_rev.find('div', class_="synthesis").get_text()
        res_data['综合评分'] = re.findall(r'\d+\.\d+', link_node06)[0]

        # the eight sub-scores: exterior, interior, space, comfort, range,
        # power, handling, value for money
        link_node07 = soup_rev.find_all('div', class_="bg")
        res_data['外观'] = re.findall(r'\d+\.\d+', link_node07[0].get_text())[0]
        res_data['内饰'] = re.findall(r'\d+\.\d+', link_node07[1].get_text())[0]
        res_data['空间'] = re.findall(r'\d+\.\d+', link_node07[2].get_text())[0]
        res_data['舒适'] = re.findall(r'\d+\.\d+', link_node07[3].get_text())[0]
        res_data['续航'] = re.findall(r'\d+\.\d+', link_node07[4].get_text())[0]
        res_data['动力'] = re.findall(r'\d+\.\d+', link_node07[5].get_text())[0]
        res_data['操控'] = re.findall(r'\d+\.\d+', link_node07[6].get_text())[0]
        res_data['性价比'] = re.findall(r'\d+\.\d+', link_node07[7].get_text())[0]

        datas.append(res_data)
        nodes_rev.append(url_rev)
        print(count)
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        print(count, "crawl failed")

# Build datas_dis: the individual owner reviews ----------------------------------
datas_dis = []
number = 0
for url in nodes_rev:  # nodes_rev holds the usable review URLs
    url_page = []

    try:
        response_ = request.urlopen(url)
        html_cont_ = response_.read()
        soup_ = BeautifulSoup(html_cont_.decode('gbk'), 'html.parser')

        # the last pager link tells us how many review pages there are
        link_nodes = soup_.find_all('a', href="javascript:void(0);", rel="nofollow", class_="page")
        last_page = re.findall(r'\d+', link_nodes[-1]['onclick'])[0]  # a string, e.g. '4'

        for i in range(1, int(last_page) + 1):
            url_page.append('http://newcar.xcar.com.cn/auto/index.php?r=reputation/reputation/GetAjaxKbList3&page='
                            + str(i) + '&pserid=' + re.findall(r'\d+', url)[0] + '&jh=0&wd=0')

    except Exception:
        # no pager found: the model has a single page of reviews
        url_page.append(url)

    number += 1
    datas_02 = []
    cout = 0  # insertion index for '发表时间', '购车地点' and '爱车评价'; advances page by page
    try:
        for url_ in url_page:
            response = request.urlopen(url_)
            html_cont = response.read()
            soup = BeautifulSoup(html_cont.decode('gbk'), 'html.parser')

            link_node_01 = soup.find_all('div', class_="name_lf")
            for i in link_node_01:
                res_data = {}
                # r'\A(.+)' anchors at the start of the string and captures up to
                # the first newline -- i.e. the reviewer's name
                res_data['评论人'] = re.findall(r'\A(.+)', i.get_text()[4:])[0]
                datas_02.append(res_data)

            link_node_02 = soup.find_all('div', class_="publish")
            cout_ = cout
            for i in link_node_02:
                datas_02[cout_]['发表时间'] = re.findall(r'\d+-\d+-\d+', i.get_text())[0]  # post date
                cout_ += 1

            link_node_03 = soup.find_all('div', class_="list_infor")
            cout_ = cout
            for i in link_node_03:
                # strip all spaces and newlines first, then pull the city out of "[xx市]"
                datas_02[cout_]['购车地点'] = re.findall(r'\[(.*?)\]',
                                                     i.get_text().replace('\n', '').replace(' ', ''))[0]
                cout_ += 1

            cout_ = cout
            link_node_04 = soup.find_all('div', class_="review_post")
            for i in link_node_04:
                # strip() trims the ends; re.sub collapses runs of blank lines in the review text
                datas_02[cout_]['爱车评价'] = re.sub(r'\n+', '\n', i.get_text().strip())
                cout_ += 1

            cout += len(link_node_01)

        print(number, "crawl succeeded")

    except Exception:
        print(number, "crawl failed")

    datas_dis.append(datas_02)

# Pair up datas and datas_dis -----------------------------------------------------
# tot = [[specs, reviews], [specs, reviews], ...]: each element couples one entry
# of datas with the matching entry of datas_dis
tot = []
for i in datas:
    tot_ = []
    tot_.append(i)
    tot.append(tot_)

num_tot = 0
for j in datas_dis:
    tot[num_tot].append(j)
    num_tot += 1

# Write everything out as 12121.html -----------------------------------------------
# open() defaults to the locale's preferred encoding, so pass encoding='utf-8'
# explicitly to get a UTF-8 file
fout = open('12121.html', 'w', encoding='utf-8')
fout.write("<html>")
fout.write("<body>")
fout.write("<table>")
for i in tot:           # i[0] is the spec dict, i[1] the list of review dicts
    for j in i[1]:      # one table row per review, with the specs repeated
        fout.write("<tr>")
        fout.write("<td>%s</td>" % i[0]['车名'])
        fout.write("<td>%s</td>" % i[0]['价格'])
        fout.write("<td>%s</td>" % i[0]['级别'])
        fout.write("<td>%s</td>" % i[0]['续航(km)'])
        #fout.write("<td>%s</td>" % i[0]['电量'])
        fout.write("<td>%s</td>" % i[0]['综合评分'])
        fout.write("<td>%s</td>" % i[0]['外观'])
        fout.write("<td>%s</td>" % i[0]['内饰'])
        fout.write("<td>%s</td>" % i[0]['空间'])
        fout.write("<td>%s</td>" % i[0]['舒适'])
        fout.write("<td>%s</td>" % i[0]['续航'])
        fout.write("<td>%s</td>" % i[0]['动力'])
        fout.write("<td>%s</td>" % i[0]['操控'])
        fout.write("<td>%s</td>" % i[0]['性价比'])
        fout.write("<td>%s</td>" % j['评论人'])
        fout.write("<td>%s</td>" % j['发表时间'])
        fout.write("<td>%s</td>" % j.get('购车地点'))
        fout.write("<td>%s</td>" % j.get('爱车评价'))
        fout.write("</tr>")
fout.write("</table>")
fout.write("</body>")
fout.write("</html>")
fout.close()
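
A hindsight note on robustness: the script above fires every request back-to-back with no timeout, delay, or retry, so a single network hiccup fails an entire model. A small fetch helper along these lines (my own sketch, not part of the original 180 lines) would make the crawl politer to the site and more resilient; each urlopen/read/decode triple above could then collapse into one fetch(url) call:

import time
from urllib import request
from urllib.error import URLError

def fetch(url, retries=3, delay=1.0, encoding='gbk'):
    # Fetch a page, decode it with the site's GBK encoding, and retry
    # transient network failures with a simple linear backoff.
    for attempt in range(1, retries + 1):
        try:
            with request.urlopen(url, timeout=10) as resp:
                return resp.read().decode(encoding, errors='ignore')
        except URLError:
            time.sleep(delay * attempt)
    raise RuntimeError('giving up on ' + url)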

The final output is the HTML file, holding roughly 800 scraped reviews (shill accounts included, no doubt). A quick pass in Excel turns it into a tidy spreadsheet of specs, scores, and review text.

  Later on, if I scrape the reviews for every model on the site, there should be a few hundred thousand comments; analyzing that data ought to help with my own car purchase down the road.
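
For a dataset that size, pandas is a better fit than Excel. A minimal sketch, assuming pandas is installed together with lxml (which read_html needs) and openpyxl (which to_excel needs); the column list simply mirrors the write order in the script above:

import pandas as pd

# read_html returns one DataFrame per <table> in the file; ours contains exactly one
df = pd.read_html('12121.html')[0]
df.columns = ['车名', '价格', '级别', '续航(km)', '综合评分', '外观', '内饰', '空间',
              '舒适', '续航', '动力', '操控', '性价比', '评论人', '发表时间', '购车地点', '爱车评价']

print(df['综合评分'].astype(float).describe())  # quick look at the overall-score distribution
df.to_excel('reviews.xlsx', index=False)        # export for anyone who still prefers Excel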

