A while ago I needed to scrape some information from web pages. I knew nothing about crawlers at the time, so I looked into WebMagic and wrote a simple crawler with it.
1. A quick introduction to WebMagic:
WebMagic has a fully modular design that covers the whole crawler lifecycle (link extraction, page downloading, content extraction, and persistence). It supports multi-threaded and distributed crawling, automatic retries, and custom User-Agent/cookie settings, among other features.
Design philosophy: WebMagic is organized around four components — a Downloader fetches pages, your PageProcessor extracts data and discovers new links, a Scheduler queues the URLs to visit, and Pipelines persist the results — all driven by a Spider. Normally the only part you have to write yourself is the PageProcessor.
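To make that concrete, here is a minimal, self-contained sketch of how the pieces fit together. The class name HelloProcessor, the seed URL, the regex, and the "title" field are placeholders of mine, not from this article:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class HelloProcessor implements PageProcessor {

    // retry failed requests 3 times, wait 1s between requests
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public void process(Page page) {
        // extract something and hand it to the pipelines
        page.putField("title", page.getHtml().xpath("//title/text()").get());
        // queue more URLs discovered on this page
        page.addTargetRequests(page.getHtml().links().regex(".*example\\.com.*").all());
    }

    public static void main(String[] args) {
        Spider.create(new HelloProcessor())
                .addUrl("http://example.com")        // seed URL (placeholder)
                .addPipeline(new ConsolePipeline())  // print results instead of persisting them
                .thread(5)
                .run();
    }
}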
Maven dependencies:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
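The slf4j-log4j12 exclusion on webmagic-extension is there to avoid pulling in a second SLF4J binding when the host project already provides one (for example Spring Boot's default Logback setup); with two bindings on the classpath SLF4J warns and picks one arbitrarily.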
The JDBC approach — the DAO class:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {

    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            // load the MySQL driver (this is what can throw ClassNotFoundException)
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***3&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, "
                    + "`category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?);";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
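If the csdnblog table does not exist yet, something like the following helper could be added to the DAO to create it. Only the column names come from the insert statement above; the method itself, the id column, and all column types are assumptions of mine:

// hypothetical helper, not part of the original DAO: creates the target table if it is missing
public void createTableIfMissing() throws SQLException {
    stmt.executeUpdate("CREATE TABLE IF NOT EXISTS `test`.`csdnblog` ("
            + "`id` INT AUTO_INCREMENT PRIMARY KEY,"                     // surrogate key (assumption)
            + "`keyes` INT, `titles` VARCHAR(255), `content` LONGTEXT,"
            + "`dates` VARCHAR(64), `tags` VARCHAR(255), `category` VARCHAR(255),"
            + "`views` INT, `comments` INT, `copyright` INT)");
}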
Entity class:
public class CsdnBlog {

    private int key;          // post id
    private String title;     // title
    private String dates;     // date
    private String tags;      // tags
    private String category;  // category
    private int view;         // view count
    private int comments;     // comment count
    private int copyright;    // original-post flag (1 = original)
    private String content;   // body text

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getKey() {
        return key;
    }

    public void setKey(int key) {
        this.key = key;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDates() {
        return dates;
    }

    public void setDates(String dates) {
        this.dates = dates;
    }

    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public int getView() {
        return view;
    }

    public void setView(int view) {
        this.view = view;
    }

    public int getComments() {
        return comments;
    }

    public void setComments(int comments) {
        this.comments = comments;
    }

    public int getCopyright() {
        return copyright;
    }

    public void setCopyright(int copyright) {
        this.copyright = copyright;
    }

    @Override
    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content + ", dates=" + dates
                + ", tags=" + tags + ", category=" + category + ", view=" + view + ", comments=" + comments
                + ", copyright=" + copyright + "]";
    }
}
Startup class:
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // CSDN username to crawl
    private static int size = 0;                       // total number of articles crawled

    // site-level crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core hook for the crawl logic; all extraction is written here
    public void process(Page page) {
        // list page
        if (!page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/\\d+").match()) {
            // queue all article pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links() // restrict to the article list area
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // use replace() to turn relative URLs into absolute ones
                    .all());
            // queue the remaining list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links() // restrict to the pagination area
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // use replace() to turn relative URLs into absolute ones
                    .all());
        // article page
        } else {
            size++; // one more article

            // collect the extracted data in a CsdnBlog object so it can be saved to the database
            CsdnBlog csdnBlog = new CsdnBlog();

            // id (parsed from the URL)
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/(\\d+)").get()));

            // title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());

            // content
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());

            // date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());

            // tags (possibly several, comma-separated)
            csdnBlog.setTags(listToString(page.getHtml().xpath(
                    "//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));

            // categories (possibly several, comma-separated)
            csdnBlog.setCategory(listToString(page.getHtml().xpath(
                    "//div[@class='category_r']/label/span/text()").all()));

            // view count: matches the "N人阅读" (N views) text on the page
            csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
                    .regex("(\\d+)人阅读").get()));

            // comment count
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));

            // original-post flag
            csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);

            // save to the database
            new CsdnBlogDao().add(csdnBlog);
            // print to the console
            System.out.println(csdnBlog);
        }
    }

    // join a List<String> into a comma-separated String
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[Crawler started]...");
        startTime = System.currentTimeMillis();
        // start from the user's blog home page with 5 threads
        Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[Crawler finished] crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; they have been saved to the database.");
    }
}
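A small side note on design: WebMagic's intended home for persistence is a Pipeline rather than a DAO call inside process(). A sketch of that variant, reusing the CsdnBlog and CsdnBlogDao classes above (the class name CsdnBlogPipeline and the "blog" field key are my own, not from the article):

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class CsdnBlogPipeline implements Pipeline {

    private final CsdnBlogDao dao = new CsdnBlogDao();

    public void process(ResultItems resultItems, Task task) {
        // whatever the PageProcessor stored with page.putField("blog", csdnBlog)
        CsdnBlog blog = resultItems.get("blog");
        if (blog != null) {
            dao.add(blog);
        }
    }
}

With this in place, process() would call page.putField("blog", csdnBlog) instead of new CsdnBlogDao().add(csdnBlog), and the spider would be created with Spider.create(new CsdnBlogPageProcessor()).addPipeline(new CsdnBlogPipeline()).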
The MySQL approach (Spring Boot + MyBatis):
public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // site-level crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
    }

    // process() is the core hook for the crawl logic; all extraction is written here
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("网址")) { // "网址" is the author's placeholder for the real URL
            DianjingVideo dv = new DianjingVideo();

            // titles
            List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs of the <a> tags
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();
            // dates
            List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // thumbnail images
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();

            for (int i = 0; i < 5; i++) {
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
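In the snippet above the Spider.create(...) call is still commented out. One way to actually kick off the crawl once the Spring context is ready is a CommandLineRunner. This is only a sketch under assumptions not in the original code: GamePageProcessor would need to be registered as a Spring bean (e.g. annotated with @Component) with its services injected, CrawlerApplication is a made-up class name, and "网址" again stands for the real start URL.

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import us.codecraft.webmagic.Spider;

@SpringBootApplication
public class CrawlerApplication implements CommandLineRunner {

    @Autowired
    private GamePageProcessor processor; // assumes GamePageProcessor is a Spring bean

    public static void main(String[] args) {
        SpringApplication.run(CrawlerApplication.class, args);
    }

    @Override
    public void run(String... args) {
        // start crawling only after the application context (and all service beans) are ready
        Spider.create(processor).addUrl("网址").thread(5).run();
    }
}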
Controller:

@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /*
     * mobile-game / e-sports listing
     */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<DianJing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
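With both the class-level and method-level mappings in place, the endpoint is GET /dianjing/dianjing; it answers with a JSON object of the form {"code":0,"success":true,"count":N,"list":[...]} when find2() returns data, or with an empty JSON object otherwise.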
The entity class is omitted here.
DAO layer:
@Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
        + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
int addDj(DianJing dj);
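For context, that @Insert method sits on a MyBatis mapper interface. A sketch of what the surrounding interface could look like — the interface name DianJingDao and the @Mapper registration are my assumptions, not shown in the original:

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

@Mapper // picked up by MyBatis-Spring-Boot; alternatively register it via @MapperScan
public interface DianJingDao {

    @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
            + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
    int addDj(DianJing dj);
}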
That's all for this walkthrough of building a Java crawler with Spring Boot and WebMagic, persisting via JDBC and MySQL. I hope it serves as a useful reference.
原文鏈接:https://www.cnblogs.com/NCL--/p/8608336.html