Scraping Top-Voted Zhihu Answers with Python and Drawing a Word Co-occurrence Network


Today we'll look at how to use Python to scrape content from a given Zhihu topic, including titles, upvote counts, view counts, and other data, and store it in Excel and a MongoDB database.

Remember our three-step crawler workflow? That's right: request 👉 parse 👉 store. Most of the code is annotated in detail, so let's dive straight in!

Import the libraries

import requests                      # HTTP requests
import json                          # parse JSON data
import sys                           # system operations (not used in the code below)
import xlwt                          # write Excel files
from bs4 import BeautifulSoup as BS  # parse HTML
import pymongo                       # connect to MongoDB
from time import sleep               # throttle requests

Define variables and parameters

queries = [u'疫情的影响']  # search keywords; several are allowed, e.g. ['新冠病毒', '武汉疫情']
entries = [u'search_terms', u'search_rank', u'question_url', u'question_title',
           u'question_follow_num', u'question_view_num',
           u'question_top_answer_username', u'question_top_answer_id']  # fields to scrape (order must match the data row built below)

headers = {
    'cookie': "",  # fill in your own Zhihu cookie here if required
    'Host': 'www.zhihu.com',
    'Referer': 'http://www.zhihu.com/',
    'user-agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) Ap"
                  "pleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safar"
                  "i/604.1",
    'Accept-Encoding': 'gzip'}  # request headers

Making the requests

For the requests we use Zhihu's v4 API directly. The request URL is https://www.zhihu.com/api/v4/questions/{qid}/answers; pass in the qid (the question's id) to get the corresponding answers. For more on the Zhihu API endpoints, see https://github.com/YaoZeyuan/ZhihuHelp/issues/89

def GET(url, headers):
    page = ""

    # normal request
    while page == "":
        try:
            page = requests.get(url, headers=headers)
            assert page is not None
            return page
        # on a connection error, wait 5 seconds and request again
        except requests.ConnectionError as e:
            print("ERR: {}".format(e))
            print("sleep for 5 seconds")
            sleep(5)
            print("continuing")
            continue

If we hit a connection error, we simply wait a bit and request again until we get a response; the delay is implemented with Python's standard time module.
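As a quick check, the helper can be pointed at the v4 answers endpoint directly. A minimal sketch: the question id below is just a placeholder, and the include parameter is a trimmed-down version of the full one built later in get_link_content.

# Minimal sketch, not part of the crawler: fetch a few answers for one question.
qid = "19550225"  # placeholder question id -- substitute a real one
demo_url = ("https://www.zhihu.com/api/v4/questions/{}/answers"
            "?include=data%5B%2A%5D.voteup_count%2Ccontent"
            "&limit=5&offset=0&sort_by=default").format(qid)
resp = GET(demo_url, headers=headers)
for ans in json.loads(resp.text).get('data', []):
    print(ans['author']['name'], ans.get('voteup_count'))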

Parsing the page content

We first fetch the raw page with the custom GET helper and parse it into a DOM tree with BeautifulSoup, which makes the later extraction easier. Once the page is a soup object, content can be pulled out by tag id, class, or CSS selector (XPath is not supported by BeautifulSoup itself; lxml would be needed for that), and the extracted string is then loaded as JSON for further parsing.
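As a small illustration of the id versus CSS-selector routes (the sample HTML here is made up; the full parsing function follows below):

# Illustration only: the embedded JSON blob can be grabbed either by id or by
# a CSS selector; both return the same tag.
sample = '<script id="js-initialData">{"initialState": {"entities": {}}}</script>'
soup = BS(sample, 'html.parser')
by_id = soup.find("script", id='js-initialData')   # lookup by id
by_css = soup.select_one('script#js-initialData')  # equivalent CSS-selector lookup
assert by_id.text == by_css.text
print(json.loads(by_id.text))                      # {'initialState': {'entities': {}}}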

def get_link_content(question_url, query_num):
    """
    :question_url: the question link
    :query_num: the question id
    """
    html = GET(question_url, headers=headers)  # fetch the page
    bs = BS(html.text, 'html.parser')  # use the html.parser backend; lxml and others would also work
    res = bs.find("script", id='js-initialData')  # locate the embedded data <script> by id
    dt = json.loads(res.text)  # parse the string as JSON
    qid = list(dt['initialState']['entities']['questions'].keys())[0]  # the question id is the key of the questions dict
    question_view_num = dt['initialState']['entities']['questions'][qid]['visitCount']  # question view count
    question_follow_num = dt['initialState']['entities']['questions'][qid]['followerCount']  # question follower count

    # build the answers URL; for the Zhihu v4 API parameters see
    # https://www.jianshu.com/p/86139ab70b86
    answer_url_tmp = "https://www.zhihu.com/api/v4/questions/{}" \
        "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Crew" \
        "ard_info%2Cis_collapsed%2Cannotation_action%2Canotation_detail%2Ccollap" \
        "se_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count" \
        "%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment" \
        "_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_" \
        "info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized" \
        "%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recogn" \
        "ized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%" \
        "5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&li" \
        "mit=5&offset=0&platform=desktop&sort_by=default"

    answer_url = answer_url_tmp.format(query_num)  # fill in the question id
    ans_response = GET(answer_url, headers=headers)  # fetch the response

    ans_response.encoding = "utf-8"  # set the encoding
    L3 = json.loads(ans_response.text)  # load the JSON body
    question_top_answer_username = L3['data'][0]['author']['name']  # username of the top answer's author
    question_top_answer_id = L3['data'][0]['author']['id']  # id of the top answer's author

    return (question_follow_num, question_view_num,
            question_top_answer_username, question_top_answer_id)  # return all the fields
    # print(question_top_answer_username)

Storing the parsed content

For storage we can write to a database or to a local Excel file, or to CSV, JSON, and other formats, whichever you prefer.
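If a plain CSV file is enough, a minimal sketch could look like the following (save_rows_to_csv is a hypothetical helper, not part of the original pipeline; it reuses the entries list defined earlier as the header row). The main function below sticks to Excel and MongoDB.

# A minimal CSV alternative (assumption: no MongoDB needed).
import csv

def save_rows_to_csv(rows, path='zhihu_results.csv'):
    """rows: a list of lists ordered the same way as `entries`."""
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(entries)   # header row
        writer.writerows(rows)     # data rows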

def get_query_content(query):
    global ct, total, worksheet, collection  # globals used for counting and writing
    for i in range(50):  # number of search-result pages (related topics) to crawl
        url_tmp = "https://www.zhihu.com/api/v4/search_v3?t=general&q={}" \
            "&correction=1&offset={}&limit=20&lc_idx={}&show_all_topics=0&se" \
            "arch_hash_id=3d21e38b5e93277e80022091d9992046&vertical_info=1" \
            "%2C1%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1"

        url = url_tmp.format(query, i * 20, i * 20 + 7)  # build the search URL
        print(url)
        response = GET(url, headers=headers)  # request the page
        response.encoding = "utf-8"
        if response.status_code != 200:
            continue

        L1 = json.loads(response.text, strict=False)
        for index in range(len(L1['data'])):  # loop over and parse each result
            item = L1['data'][index]
            if 'object' not in item or 'type' not in item['object']:
                continue
            if item['object']['type'] != 'answer':
                continue

            link = "https://www.zhihu.com/question/" + item['object']['question']['id']
            if (collection.count_documents({"search_terms": query,
                                            "question_url": link}) != 0):
                continue

            ct += 1
            total += 1
            # print(str(ct) + ", " + str(item['index']) + ": " + item['highlight']['title'])

            # fetch the remaining fields from the question page and the v4 API
            (q_follow_num, q_view_num, q_top_ans_usrname,
             q_top_ans_id) = get_link_content(link, item['object']['question']['id'])

            title = item['highlight']['title'].replace("</em>", "")
            title = title.replace("<em>", "")
            # print(title + "\n")

            data = [query, ct, link, title, q_follow_num, q_view_num,
                    q_top_ans_usrname, q_top_ans_id]  # assemble the row (same order as entries)

            for col, entry in enumerate(entries):  # write each field into the worksheet
                worksheet.write(total, col, label=data[col])

            # the record to insert into MongoDB
            update_data = {"search_terms": query,
                           "search_rank": ct,
                           "question_url": link,
                           "question_title": title,
                           "question_follow_num": q_follow_num,
                           "question_view_num": q_view_num,
                           "question_top_answer_username": q_top_ans_usrname,
                           "question_top_answer_id": q_top_ans_id}

            doc_ct = collection.estimated_document_count()  # document count before the insert
            collection.insert_one(update_data)  # insert one record
            doc_ct_new = collection.estimated_document_count()  # count again
            assert(doc_ct == doc_ct_new - 1)  # sanity-check that the insert succeeded

In the code above, range(50) sets how many pages of related results to crawl. It helps to skim the search results first to see roughly how many entries are worth collecting and adjust this number accordingly; if the keyword has only a few related topics, a large value will raise errors, so set it smaller.
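One possible guard (an assumption on my part, not in the original code) is to stop paging as soon as the search API returns an empty data list, instead of relying on range(50) being small enough. Inside the loop, right after L1 is loaded:

# Stop paging once there are no more search results for this keyword,
# instead of raising an error on a later page.
if not L1.get('data'):
    break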

Putting it together and starting the crawl

Finally, we initialize the workbook and MongoDB and call get_query_content() to start the little crawler. When it finishes, the result file is written to the excel_path we configured.

if __name__ == '__main__':

    path = './'  # root directory
    excel_path = path + '/0214.xls'  # path and file name of the result
    mongdb_name = 'mongodb://127.0.0.1:27017/'  # MongoDB address

    # search_rank counter
    ct = 0
    # number of rows written to excel
    total = 0

    # create and initialize the Excel workbook
    workbook = xlwt.Workbook(encoding='utf-8')

    worksheet = workbook.add_sheet('my_worksheet')
    for i, entry in enumerate(entries):
        worksheet.write(0, i, label=entry)  # write the header row from entries

    workbook.save(excel_path)  # save the workbook

    # initialize the MongoDB connection
    client = pymongo.MongoClient(mongdb_name)

    # create (or reuse) a database named spiderdb
    mydb = client['spiderdb']

    # list the collections in the current database
    collist = mydb.list_collection_names()

    # warn if zh_answer already exists
    if "zh_answer" in collist:
        print("already exists collection with name "
              "zh_answer in database spiderdb")
    # otherwise it is created on first insert
    collection = mydb['zh_answer']

    # clear any preexisting data in the collection
    collection.delete_many({})

    # loop over the queries, writing to Excel and MongoDB
    for query in queries:
        ct = 0
        get_query_content(query)
        workbook.save(excel_path)

0214.xls

Let's take a look at the scraped content. In a follow-up we'll go on to scrape all the answers under these topics for text analysis:

Once the text has been processed in Python, we can either import it into Gephi for plotting or use the relevant R packages. Here is the final result (a small sketch of the co-occurrence preprocessing follows the figures):

  • Rendered with Gephi:

  • Rendered with R:
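For reference, here is a minimal sketch of how the co-occurrence edges themselves might be built in Python before importing into Gephi. The libraries (jieba for segmentation, networkx for the graph) and the helper name are my assumptions, not part of the original pipeline; the input texts could be the scraped question titles or, later, the full answers.

# A minimal sketch, assuming jieba and networkx are installed: build a word
# co-occurrence network from a list of texts and export a .gexf file for Gephi.
from itertools import combinations
from collections import Counter
import jieba
import networkx as nx

def build_cooccurrence_graph(texts, min_count=2):
    """texts: an iterable of strings (e.g. the question titles scraped above)."""
    pair_counts = Counter()
    for text in texts:
        words = {w for w in jieba.lcut(text) if len(w) > 1}  # drop single characters
        for w1, w2 in combinations(sorted(words), 2):        # words co-occurring in one text
            pair_counts[(w1, w2)] += 1

    g = nx.Graph()
    for (w1, w2), cnt in pair_counts.items():
        if cnt >= min_count:                                 # prune rare pairs
            g.add_edge(w1, w2, weight=cnt)
    return g

# Example usage: export for Gephi
# g = build_cooccurrence_graph(list_of_titles)
# nx.write_gexf(g, 'cooccurrence.gexf')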

知识星球 attachment link: https://t.zsxq.com/r3vzne2

# Python, R
