github源碼地址:
https://github.com/kuishou68/python
各類圖表的實現效果
爬取的說說內容
個性化說說內容詞云圖
每年發表說說總數柱狀圖、每年點贊和評論折線圖
7天好友動態柱狀圖、餅圖
使用方法
按照你的谷歌瀏覽器版本下載對應版本的驅動:http://chromedriver.storage.googleapis.com/index.html
驅動跟兩個python腳本放入同目錄,我的版本是90.0.4430的,查看你自己的版本,下載后把我的chromedriver.exe替換掉!
這里用到了很多第三方包,鼠標放在報紅的包名下,用alt+enter導包,如果失敗則在控制臺用下面的必殺技
1
|
pip install 包名 -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
主要代碼
qq空間txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
import time

from lxml import etree
from selenium import webdriver


def _first_or_empty(nodes):
    """Return the first XPath text result, or '' when nothing matched."""
    return nodes[0] if nodes else ''


# NOTE(review): the account data below is hard-coded in the script; avoid
# committing real credentials to version control.
friend = '1569339843'  # friend's QQ number; their Qzone must be visible to you
user = '783533896'  # your QQ number
pw = '1323mkonji.@'  # your QQ password

# Browser driver binary, expected in the same directory as this script.
# NOTE(review): `executable_path` and `find_element_by_id` look like the
# Selenium 3 spellings; confirm against the installed Selenium version.
chrome_driver = 'chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)

try:
    # Maximize the browser window.
    driver.maximize_window()

    # Open the QQ login page.
    driver.get("http://i.qq.com")

    # The login form lives inside a frame; select it first, otherwise the
    # elements used below cannot be located.
    driver.switch_to.frame("login_frame")
    time.sleep(3)

    # Switch to the account/password login mode.
    driver.find_element_by_id("switcher_plogin").click()
    time.sleep(3)

    # Type the QQ number into the account field.
    driver.find_element_by_id("u").send_keys(user)
    time.sleep(5)

    # Type the password into the password field.
    driver.find_element_by_id("p").send_keys(pw)
    time.sleep(5)

    # Click the login button.
    driver.find_element_by_id("login_button").click()
    time.sleep(5)

    # Hand control back to the top-level document.
    driver.switch_to.default_content()
    time.sleep(5)

    # Jump to the posts ("shuoshuo") page; change `friend` to visit another
    # space.
    driver.get("http://user.qzone.qq.com/" + friend + "/311")
    time.sleep(5)

    next_num = 0  # numeric suffix of the current "next page" button id
    while True:
        # Scroll down in five steps (i = 1..5) so the browser loads the
        # dynamically loaded content of the current page.
        for i in range(1, 6):
            height = 20000 * i  # scroll distance in pixels for this step
            # JavaScript is case-sensitive: the method is `scrollBy`.
            scroll_script = "window.scrollBy(0," + str(height) + ")"
            driver.execute_script(scroll_script)
            time.sleep(4)

        # Pages are often built from several <frame>/<iframe> elements and
        # webdriver targets the outermost one by default, so select the frame
        # that holds the posts before querying it.
        driver.switch_to.frame("app_canvas_frame")
        selector = etree.HTML(driver.page_source)
        # NOTE(review): the id is assumed to be "msgList"; the damaged source
        # showed the case-folded "msglist" -- confirm against the live page.
        divs = selector.xpath('//*[@id="msgList"]/li/div[3]')

        # Mode 'a' appends, so every page is added without clearing the file.
        # The explicit encoding avoids errors when writing non-ASCII text.
        with open('qq_word.txt', 'a', encoding="utf-8") as f:
            for div in divs:
                qq_name = _first_or_empty(div.xpath('./div[2]/a/text()'))
                qq_content = _first_or_empty(div.xpath('./div[2]/pre/text()'))
                qq_content = qq_content.replace('\n', ' ')
                qq_time = _first_or_empty(
                    div.xpath('./div[4]/div[1]/span/a/text()'))
                qq_praise = _first_or_empty(
                    div.xpath('./div[4]/div[2]/span/span/a[2]/text()'))
                qq_comment = _first_or_empty(
                    div.xpath('./div[4]/div[2]/a[3]/text()'))
                print(qq_name, qq_time, qq_content, qq_praise, qq_comment)
                f.write(qq_content + "\n")

        # On the last page the "next page" button no longer carries this id,
        # so the crawl can stop.
        next_id = 'pager_next_' + str(next_num)
        if driver.page_source.find(next_id) == -1:
            break

        # The button id changes from page to page, hence the running counter.
        driver.find_element_by_id(next_id).click()
        next_num += 1

        # The next iteration starts by scrolling the page, which has to be
        # done from the outer frame.
        driver.switch_to.parent_frame()
finally:
    # Always close the browser, even if a step above raised.
    driver.quit()
各種圖表的生成
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
import re

import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Pie

# Spreadsheet column headers and the year marker used inside the time text.
# NOTE(review): these were restored from a garbled source; verify that they
# match the header row of qq_excel.xlsx exactly (including Traditional vs.
# Simplified Chinese spelling).
TIME_COLUMN = '時間'
THUMB_COLUMN = '贊'
COMMENT_COLUMN = '評論'
YEAR_MARK = '年'

# Reads the first sheet (sheet 0) by default into a pandas DataFrame.
df_excel = pd.read_excel('qq_excel.xlsx')


def gettimestr(row):
    """Return the year part of a row's time cell, or None if it is empty.

    The year is the text before the first year marker in the cell; when the
    marker is absent the whole cell text is returned.
    """
    item = row[TIME_COLUMN]
    if pd.isna(item):
        return None
    return item.split(YEAR_MARK)[0]


def readcount(result, row):
    """Increment the per-year post counter in *result* for *row*."""
    year = gettimestr(row)
    if year is None:
        return
    result[year] = result.get(year, 0) + 1


def _add_parenthesized_count(result, row, column):
    """Add the integer found between '(' and ')' in row[column] to the
    per-year total in *result*; rows without such a number are skipped."""
    item = row[column]
    if pd.isnull(item):
        return
    parts = item.split("(")
    if len(parts) <= 1:
        return
    number = parts[1].split(")")[0]
    year = gettimestr(row)
    if year is None:
        return
    result[year] = result.get(year, 0) + int(number)


def readthumb(result, row):
    """Accumulate the per-year like count from the like column of *row*."""
    _add_parenthesized_count(result, row, THUMB_COLUMN)


def readcomment(result, row):
    """Accumulate the per-year comment count from the comment column."""
    _add_parenthesized_count(result, row, COMMENT_COLUMN)


def readexcel(df_excel):
    """Aggregate the sheet into per-year statistics.

    Returns a dict with the keys 'count', 'thumb' and 'comment', each mapping
    a year string to an integer total.
    """
    count = {}
    thumb = {}
    comment = {}
    for _, row in df_excel.iterrows():
        readcount(count, row)
        readthumb(thumb, row)
        readcomment(comment, row)
    return {'count': count, 'thumb': thumb, 'comment': comment}


def getkeyandval(keyword):
    """Return [years, values] for one statistic, in reversed sheet order."""
    data = readexcel(df_excel).get(keyword)
    keys = list(data.keys())[::-1]
    values = list(data.values())[::-1]
    return [keys, values]


def paintbar():
    """Render the bar chart of the number of posts published per year."""
    data = getkeyandval('count')
    print(data[0])
    # pyecharts v1 and later support chained calls.
    (
        Bar()
        .add_xaxis(data[0])
        .add_yaxis("每年發表說說總數", data[1])
        .render("每年發表說說總數柱狀圖.html")
    )


paintbar()


def paintline():
    """Render the line chart of likes and comments per year.

    Both series are aligned to one shared year axis (every year that has at
    least one post); a year missing from a series is plotted as 0. Plotting
    each series in its own key order would shift values onto the wrong year
    whenever the two series do not cover the same years.
    """
    stats = readexcel(df_excel)
    years = list(stats['count'].keys())[::-1]
    comment_values = [stats['comment'].get(year, 0) for year in years]
    thumb_values = [stats['thumb'].get(year, 0) for year in years]
    (
        Line()
        .add_xaxis(xaxis_data=years)
        .add_yaxis("每年評論數", y_axis=comment_values)
        .add_yaxis("每年點贊數", y_axis=thumb_values)
        .render("每年點贊和評論折現圖.html")  # write the chart to disk
    )


paintline()
其他代碼自行下載項目查看
以上就是python爬取網頁版qq空間,生成各類圖表的詳細內容,更多關于python 爬取qq空間的資料請關注服務器之家其它相關文章!
原文鏈接:https://github.com/kuishou68/python