WebMagic是一個開源爬蟲框架,本項目通過在SpringBoot項目中使用WebMagic去抓取數據,最后使用MyBatis將數據入庫。
本項目代碼地址:ArticleCrawler: SrpingBoot+WebMagic+MyBaties實現爬蟲和數據入庫 (gitee.com)
創建數據庫:
本示例中庫名為article,表名為cms_content,表中包含contentId、title、date三個字段。
1
2
3
4
5
6
|
CREATE TABLE `cms_content` ( `contentId` varchar (40) NOT NULL COMMENT '內容ID' , `title` varchar (150) NOT NULL COMMENT '標題' , ` date ` varchar (150) NOT NULL COMMENT '發布日期' , PRIMARY KEY (`contentId`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT= 'CMS內容表' ; |
新建SpringBoot項目:
1、配置依賴pom.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
<? xml version = "1.0" encoding = "UTF-8" ?> < project xmlns = "http://maven.apache.org/POM/4.0.0" xmlns:xsi = "http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation = "http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd" > < modelVersion >4.0.0</ modelVersion > < parent > < groupId >org.springframework.boot</ groupId > < artifactId >spring-boot-starter-parent</ artifactId > < version >2.5.5</ version > < relativePath /> </ parent > < groupId >com.example</ groupId > < artifactId >Article</ artifactId > < version >0.0.1-SNAPSHOT</ version > < name >Article</ name > < description >Article</ description > < properties > < java.version >1.8</ java.version > < project.build.sourceEncoding >UTF-8</ project.build.sourceEncoding > < maven.test.skip >true</ maven.test.skip > < maven.compiler.plugin.version >3.8.1</ maven.compiler.plugin.version > < maven.resources.plugin.version >3.1.0</ maven.resources.plugin.version > < mysql.connector.version >5.1.47</ mysql.connector.version > < druid.spring.boot.starter.version >1.1.17</ druid.spring.boot.starter.version > < mybatis.spring.boot.starter.version >1.3.4</ mybatis.spring.boot.starter.version > < fastjson.version >1.2.58</ fastjson.version > < commons.lang3.version >3.9</ commons.lang3.version > < joda.time.version >2.10.2</ joda.time.version > < webmagic.core.version >0.7.5</ webmagic.core.version > </ properties > < dependencies > < dependency > < groupId >org.springframework.boot</ groupId > < artifactId >spring-boot-starter-web</ artifactId > </ dependency > < dependency > < groupId >org.springframework.boot</ groupId > < artifactId >spring-boot-starter-test</ artifactId > < scope >test</ scope > </ dependency > < dependency > < groupId >org.springframework.boot</ groupId > < artifactId >spring-boot-configuration-processor</ artifactId > < optional >true</ optional > </ dependency > < dependency > < groupId >mysql</ groupId > < artifactId >mysql-connector-java</ artifactId > < version 
>${mysql.connector.version}</ version > </ dependency > < dependency > < groupId >com.alibaba</ groupId > < artifactId >druid-spring-boot-starter</ artifactId > < version >${druid.spring.boot.starter.version}</ version > </ dependency > < dependency > < groupId >org.mybatis.spring.boot</ groupId > < artifactId >mybatis-spring-boot-starter</ artifactId > < version >${mybatis.spring.boot.starter.version}</ version > </ dependency > < dependency > < groupId >com.alibaba</ groupId > < artifactId >fastjson</ artifactId > < version >${fastjson.version}</ version > </ dependency > < dependency > < groupId >org.apache.commons</ groupId > < artifactId >commons-lang3</ artifactId > < version >${commons.lang3.version}</ version > </ dependency > < dependency > < groupId >joda-time</ groupId > < artifactId >joda-time</ artifactId > < version >${joda.time.version}</ version > </ dependency > < dependency > < groupId >us.codecraft</ groupId > < artifactId >webmagic-core</ artifactId > < version >${webmagic.core.version}</ version > < exclusions > < exclusion > < groupId >org.slf4j</ groupId > < artifactId >slf4j-log4j12</ artifactId > </ exclusion > </ exclusions > </ dependency > </ dependencies > < build > < plugins > < plugin > < groupId >org.apache.maven.plugins</ groupId > < artifactId >maven-compiler-plugin</ artifactId > < version >${maven.compiler.plugin.version}</ version > < configuration > < source >${java.version}</ source > < target >${java.version}</ target > < encoding >${project.build.sourceEncoding}</ encoding > </ configuration > </ plugin > < plugin > < groupId >org.apache.maven.plugins</ groupId > < artifactId >maven-resources-plugin</ artifactId > < version >${maven.resources.plugin.version}</ version > < configuration > < encoding >${project.build.sourceEncoding}</ encoding > </ configuration > </ plugin > < plugin > < groupId >org.springframework.boot</ groupId > < artifactId >spring-boot-maven-plugin</ artifactId > < configuration > < fork >true</ fork > 
< addResources >true</ addResources > </ configuration > < executions > < execution > < goals > < goal >repackage</ goal > </ goals > </ execution > </ executions > </ plugin > </ plugins > </ build > < repositories > < repository > < id >public</ id > < name >aliyun nexus</ name > < url >http://maven.aliyun.com/nexus/content/groups/public/</ url > < releases > < enabled >true</ enabled > </ releases > </ repository > </ repositories > < pluginRepositories > < pluginRepository > < id >public</ id > < name >aliyun nexus</ name > < url >http://maven.aliyun.com/nexus/content/groups/public/</ url > < releases > < enabled >true</ enabled > </ releases > < snapshots > < enabled >false</ enabled > </ snapshots > </ pluginRepository > </ pluginRepositories > </ project > |
2、創建CmsContentPO.java
數據實體,和表中3個字段對應。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
package site.exciter.article.model;

/**
 * Row object for the {@code cms_content} table.
 *
 * <p>Carries the three persisted fields: {@code contentId} (primary key),
 * {@code title} and {@code date} — all stored as plain strings, matching the
 * table definition.
 */
public class CmsContentPO {

    // Primary key; populated with a UUID by the pipeline.
    private String contentId;
    // Article title.
    private String title;
    // Publication date as extracted from the page (kept as a string).
    private String date;

    public String getContentId() {
        return contentId;
    }

    public void setContentId(String contentId) {
        this.contentId = contentId;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }
}
3、創建CrawlerMapper.java
1
2
3
4
5
6
7
8
9
|
package site.exciter.article.dao; import org.apache.ibatis.annotations.Mapper; import site.exciter.article.model.CmsContentPO; @Mapper public interface CrawlerMapper { int addCmsContent(CmsContentPO record); } |
4、配置映射文件CrawlerMapper.xml
在resources下新建mapper文件夾,在mapper下創建CrawlerMapper.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
|
<? xml version = "1.0" encoding = "UTF-8" ?> <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> < mapper namespace = "site.exciter.article.dao.CrawlerMapper" > < insert id = "addCmsContent" parameterType = "site.exciter.article.model.CmsContentPO" > insert into cms_content (contentId, title, date) values (#{contentId,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, #{date,jdbcType=VARCHAR}) </ insert > </ mapper > |
5、配置application.properties
配置數據庫和mybatis映射關系。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
# mysql spring.datasource.name=mysql spring.datasource.type=com.alibaba.druid.pool.DruidDataSource spring.datasource.driver-class-name=com.mysql.jdbc.Driver spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true spring.datasource.username=root spring.datasource.password=root # druid spring.datasource.druid.initial-size=5 spring.datasource.druid.min-idle=5 spring.datasource.druid.max-active=10 spring.datasource.druid.max-wait=60000 spring.datasource.druid.validation-query=SELECT 1 FROM DUAL spring.datasource.druid.test-on-borrow=false spring.datasource.druid.test-on-return=false spring.datasource.druid.test-while-idle=true spring.datasource.druid.time-between-eviction-runs-millis=60000 spring.datasource.druid.min-evictable-idle-time-millis=300000 spring.datasource.druid.max-evictable-idle-time-millis=600000 # mybatis mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml |
6、創建ArticlePageProcessor.java
解析html的邏輯。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
package site.exciter.article; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; @Component public class ArticlePageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes( 3 ).setSleepTime( 1000 ); @Override public void process(Page page) { String detail_urls_Xpath = "//*[@class='postTitle']/a[@class='postTitle2']/@href" ; String next_page_xpath = "//*[@id='nav_next_page']/a/@href" ; String next_page_css = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)" ; String title_xpath = "//h1[@class='postTitle']/a/span/text()" ; String date_xpath = "//span[@id='post-date']/text()" ; page.putField( "title" , page.getHtml().xpath(title_xpath).toString()); if (page.getResultItems().get( "title" ) == null ) { page.setSkip( true ); } page.putField( "date" , page.getHtml().xpath(date_xpath).toString()); if (page.getHtml().xpath(detail_urls_Xpath).match()) { Selectable detailUrls = page.getHtml().xpath(detail_urls_Xpath); page.addTargetRequests(detailUrls.all()); } if (page.getHtml().xpath(next_page_xpath).match()) { Selectable nextPageUrl = page.getHtml().xpath(next_page_xpath); page.addTargetRequests(nextPageUrl.all()); } else if (page.getHtml().css(next_page_css).match()) { Selectable nextPageUrl = page.getHtml().css(next_page_css).links(); page.addTargetRequests(nextPageUrl.all()); } } @Override public Site getSite() { return site; } } |
7、創建ArticlePipeline.java
處理數據的持久化。
- package site.exciter.article;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Component;
- import site.exciter.article.model.CmsContentPO;
- import site.exciter.article.dao.CrawlerMapper;
- import us.codecraft.webmagic.ResultItems;
- import us.codecraft.webmagic.Task;
- import us.codecraft.webmagic.pipeline.Pipeline;
- import java.util.UUID;
- @Component
- public class ArticlePipeline implements Pipeline {
- private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class);
- @Autowired
- private CrawlerMapper crawlerMapper;
- public void process(ResultItems resultItems, Task task) {
- String title = resultItems.get("title");
- String date = resultItems.get("date");
- CmsContentPO contentPO = new CmsContentPO();
- contentPO.setContentId(UUID.randomUUID().toString());
- contentPO.setTitle(title);
- contentPO.setDate(date);
- try {
- boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
- LOGGER.info("保存成功:{}", title);
- } catch (Exception ex) {
- LOGGER.error("保存失敗", ex);
- }
- }
- }
8、創建ArticleTask.java
執行抓取任務。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
package site.exciter.article; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Spider; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @Component public class ArticleTask { private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline. class ); @Autowired private ArticlePipeline articlePipeline; @Autowired private ArticlePageProcessor articlePageProcessor; private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor(); public void crawl() { // 定時任務,每10分鐘爬取一次 timer.scheduleWithFixedDelay(() -> { Thread.currentThread().setName( "ArticleCrawlerThread" ); try { Spider.create(articlePageProcessor) .addUrl( "http://www.cnblogs.com/dick159/default.html?page=2" ) // 抓取到的數據存數據庫 .addPipeline(articlePipeline) // 開啟5個線程抓取 .thread( 5 ) // 異步啟動爬蟲 .start(); } catch (Exception ex) { LOGGER.error( "定時抓取數據線程執行異常" , ex); } }, 0 , 10 , TimeUnit.MINUTES); } } |
9、修改Application
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
package site.exciter.article; import org.mybatis.spring.annotation.MapperScan; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.CommandLineRunner; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; @SpringBootApplication @MapperScan (basePackages = "site.exciter.article.interface" ) public class ArticleApplication implements CommandLineRunner { @Autowired private ArticleTask articleTask; public static void main(String[] args) { SpringApplication.run(ArticleApplication. class , args); } @Override public void run(String... args) throws Exception { articleTask.crawl(); } } |
10、執行application,開始抓數據并入庫
到此這篇關于SpringBoot+WebMagic+MyBatis實現爬蟲和數據入庫的示例的文章就介紹到這了,更多相關SpringBoot+WebMagic+MyBatis爬蟲和數據入庫內容請搜索服務器之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持服務器之家!
原文鏈接:https://juejin.cn/post/7018897037219332104