Taobao's pages are complex: reverse-engineering the Ajax requests or the JavaScript directly is tedious, so here we drive a real browser with Selenium instead.
The goal is to scrape every product listing returned by a Taobao search for '美食' (food).
spider.py
# -*- coding: utf-8 -*-
# Note: this script is written for Python 2 (see the unicode-escape handling below).
import re
import json

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from config import *

client = pymongo.MongoClient(MONGODB_URL)
db = client[MONGODB_DB]

# Use PhantomJS, with the options defined in config.py
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# The default window is too small and breaks the page layout, so set it explicitly
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)


def search():
    print('searching...')
    # The page load easily times out, so retry on TimeoutException
    try:
        browser.get('https://www.taobao.com')
        # Wait until both the search box and the submit button are ready
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
        )
        # Python 2 quirk: KEYWORD is stored as a unicode-escape sequence, so decode it first
        # (equivalent to input.send_keys('\u7f8e\u98df'.decode('unicode-escape')))
        input.send_keys(KEYWORD.decode('unicode-escape'))
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
        )
        get_product()
        return total.text
    except TimeoutException:
        return search()


def next_page(page_number):
    print('turning to page ' + str(page_number))
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        input.clear()
        input.send_keys(page_number)
        submit.click()
        # Confirm the jump succeeded: the highlighted page number must equal what we typed
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_number)))
        get_product()
    except TimeoutException:
        return next_page(page_number)


# Extract the product information from the current result page
def get_product():
    # Wait for the item list to be present before grabbing the page source
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .items'))
    )
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#mainsrp-itemlist .m-itemlist .items .item.J_MouserOnverReq')
    for item in items:
        img = item.select('.J_ItemPic.img')[0].get('src')
        price = item.select('.price.g_price.g_price-highlight > strong')[0].get_text()
        deal = item.select('.deal-cnt')[0].get_text()
        title = item.select('.row.row-2.title > a')[0].get_text().strip()
        shop = item.select('.row.row-3.g-clearfix > .shop > a > span:nth-of-type(2)')[0].get_text()
        location = item.select('.location')[0].get_text()
        product = {
            'img': img,
            'price': price,
            'deal': deal,
            'title': title,
            'shop': shop,
            'location': location
        }
        # Debug print; decode so the Chinese text displays readably under Python 2
        print(json.dumps(product).decode('unicode-escape'))
        save_to_mongo(product)


def save_to_mongo(product):
    try:
        if db[MONGODB_TABLE].insert(product):
            print('saved to mongodb: ' + str(product))
    except Exception:
        print('failed to save to mongodb: ' + str(product))


def main():
    try:
        total = search()
        # Pull the page count out of the pager text (e.g. '共 100 页,') with a regex
        pattern = re.compile('(\d+)')
        total = int(pattern.search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('error')
    finally:
        # quit() also terminates the PhantomJS process; close() would leave it running
        browser.quit()


if __name__ == '__main__':
    main()
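A note on the driver: PhantomJS support has been removed from recent Selenium releases, so running this today will likely require a different headless browser. Below is a minimal sketch of swapping in headless Chrome (assumptions: Python 3, Chrome and a matching chromedriver installed); the rest of the spider stays the same, except that under Python 3 the unicode-escape dance for KEYWORD is unnecessary and a plain '美食' literal works.

# Sketch only, not the original author's setup: headless Chrome in place of PhantomJS
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # no visible browser window
options.add_argument('--blink-settings=imagesEnabled=false')  # rough analogue of --load-images=false
browser = webdriver.Chrome(options=options)
browser.set_window_size(1400, 900)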
config.py
MONGODB_URL = 'localhost'
MONGODB_DB = 'taobao'
MONGODB_TABLE = 'meishi'

# PhantomJS options: skip images and enable the disk cache to speed things up
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# The unicode escape for '美食'; using the raw Chinese literal raises an error under Python 2
KEYWORD = '\u7f8e\u98df'
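Once the spider has run, a quick way to confirm the data actually landed in MongoDB is to query it back. A minimal sketch, assuming pymongo 3.7+ and the config above:

import pymongo
from config import MONGODB_URL, MONGODB_DB, MONGODB_TABLE

client = pymongo.MongoClient(MONGODB_URL)
db = client[MONGODB_DB]
print(db[MONGODB_TABLE].count_documents({}))   # number of stored products
for doc in db[MONGODB_TABLE].find().limit(3):  # peek at a few documents
    print(doc)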
That wraps up this article on using Python and Selenium to scrape Taobao's asynchronously loaded data. I hope it serves as a useful reference, and thank you for supporting 服务器之家.
Original article: https://blog.csdn.net/wqh_jingsong/article/details/66472106