存入MongoDB
1.啟動MongoDB數據庫:sudo mongod
2.執行下面程序:py2 process_youyuan_mongodb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# process_youyuan_mongodb.py
# -*- coding: utf-8 -*-
"""Drain crawled items from a Redis list and store each one in MongoDB."""

import json

import pymongo
import redis


def main():
    """Copy every item pushed to the Redis list ``youyuan:items`` into MongoDB.

    Runs forever: each iteration blocks on Redis until an item is available,
    decodes it from JSON, inserts it into the ``beijing_18_25`` collection of
    the ``youyuan`` database and prints a one-line progress message.
    """
    # Redis server that holds the crawled items.
    rediscli = redis.StrictRedis(host='192.168.199.108', port=6379, db=0)
    # MongoDB server that receives the items.
    mongocli = pymongo.MongoClient(host='localhost', port=27017)
    # Target database.
    db = mongocli['youyuan']
    # Target collection.
    sheet = db['beijing_18_25']

    while True:
        # blpop pops from the head of the list (FIFO mode); brpop would pop
        # from the tail (LIFO mode). The call returns a (key, value) pair and
        # only the value is needed here.
        _key, data = rediscli.blpop(["youyuan:items"])
        item = json.loads(data)
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document insert.
        sheet.insert_one(item)
        try:
            # A single parenthesised argument prints identically on
            # Python 2 and Python 3.
            print(u"Processing: %(name)s <%(link)s>" % item)
        except KeyError:
            # The item lacks 'name' or 'link'; it has still been stored.
            print(u"Error procesing: %r" % item)


if __name__ == '__main__':
    main()
存入 MySQL
1.啟動mysql:mysql.server start(各平臺不一樣)
2.登錄到root用戶:mysql -uroot -p
3.創建數據庫youyuan:create database youyuan;
4.切換到指定數據庫:use youyuan
5.創建表beijing_18_25以及所有字段的列名和數據類型。
6.執行下面程序:py2 process_youyuan_mysql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
# process_youyuan_mysql.py
# -*- coding: utf-8 -*-
"""Drain crawled items from a Redis list and store each one in MySQL."""

import json

import MySQLdb
import redis


def _run_cleanup(action):
    """Call a cleanup callable, reporting (not raising) any MySQL error."""
    try:
        action()
    except MySQLdb.Error as e:
        print("Mysql Error during cleanup: %r" % (e,))


def main():
    """Copy every item pushed to the Redis list ``youyuan:items`` into MySQL.

    Runs forever: each iteration blocks on Redis until an item is available,
    decodes it from JSON and inserts one row into the ``beijing_18_25`` table
    of the ``youyuan`` database. Items that lack a required field, or whose
    insert fails, are reported on stdout and skipped so the loop keeps running.
    """
    # Redis server that holds the crawled items.
    rediscli = redis.StrictRedis(host='192.168.199.108', port=6379, db=0)
    # MySQL connection that receives the items.
    # NOTE(review): no charset is given; if non-ASCII text fails to encode,
    # passing charset='utf8' to connect() is the likely fix -- confirm.
    mysqlcli = MySQLdb.connect(host='127.0.0.1', user='power',
                               passwd='xxxxxxx', db='youyuan', port=3306,
                               use_unicode=True)

    while True:
        # blpop pops from the head of the list (FIFO mode); brpop would pop
        # from the tail (LIFO mode). The call returns a (key, value) pair and
        # only the value is needed here.
        _key, data = rediscli.blpop(["youyuan:items"])
        item = json.loads(data)

        try:
            # Values in the same order as the column list of the INSERT below.
            params = [item['username'], item['crawled'], item['age'],
                      item['spider'], item['header_url'], item['source'],
                      item['pic_urls'], item['monologue'], item['source_url']]
        except KeyError as e:
            # One malformed item must not stop the whole worker.
            print("Missing field %s in item: %r" % (e, item))
            continue

        cur = None
        try:
            # Obtain a cursor for this insert.
            cur = mysqlcli.cursor()
            # Parameterised INSERT: the driver escapes every value.
            cur.execute("INSERT INTO beijing_18_25 (username, crawled, age, spider, header_url, source, pic_urls, monologue, source_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s )", params)
            # Commit the transaction.
            mysqlcli.commit()
            print("inserted %s" % item['source_url'])
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
            # Discard the failed transaction so it cannot leak into the next.
            _run_cleanup(mysqlcli.rollback)
        finally:
            # Always release the cursor, including after a failed insert.
            if cur is not None:
                _run_cleanup(cur.close)


if __name__ == '__main__':
    main()
總結
以上所述是小編給大家介紹的分布式爬蟲處理Redis裡的數據操作步驟,希望對大家有所幫助,如果大家有任何疑問歡迎給我留言,小編會及時回復大家的!