[Java]: Web Crawler
усил's Blog · 2020/5/23 · Java
# Implementing a Simple Web Crawler in Java
# 1. Prerequisites
(1) Create a Maven project. (2) Add the httpcomponents and htmlcleaner JAR dependencies.
# 2. Technology Overview
(1) A Maven project makes project management and JAR imports easier. (2) The httpcomponents JAR is used to fetch page content. (3) The htmlcleaner JAR is used to parse the HTML page content. (4) XPath support, for locating elements in the parsed page. (5) Regular-expression support, for cleaning up the extracted text.
# 3. Adding the JAR Dependencies
Add the following to the pom.xml of the Maven project:
```xml
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.4</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.htmlcleaner</groupId>
        <artifactId>htmlcleaner</artifactId>
        <version>2.10</version>
    </dependency>
</dependencies>
```
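Note that the snippet above is only the `<dependencies>` element; it has to sit inside a complete POM. A minimal sketch, with placeholder coordinates (`com.example:crawler-demo`) standing in for your own project:

```xml
<project xmlns="http://maven.apache.org/POM/4.0.0">
    <modelVersion>4.0.0</modelVersion>
    <!-- placeholder coordinates; replace with your own groupId/artifactId -->
    <groupId>com.example</groupId>
    <artifactId>crawler-demo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- the two dependencies shown above go here -->
    </dependencies>
</project>
```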
# 4. Implementation
Step 1: Create the page class (it holds the downloaded page content).
```java
public class Page {
    private String content; // page content

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
```
Step 2: The concrete implementation (a utility class for downloading pages).
```java
import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class PageDownLoadUtil {
    // download a page and return its content as a string
    public static String getPageContent(String url) {
        HttpClientBuilder builder = HttpClients.custom();
        String content = null;
        // try-with-resources closes the client and response automatically
        try (CloseableHttpClient client = builder.build()) {
            HttpGet request = new HttpGet(url);
            // execute the request and get the response
            try (CloseableHttpResponse response = client.execute(request)) {
                // get the response entity
                HttpEntity entity = response.getEntity();
                // convert the response content to a string
                content = EntityUtils.toString(entity);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }
}
```
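With the utility in place, a quick sanity check is to download any page and print it. A minimal sketch; the URL here is only a placeholder:

```java
public class DownLoadUtilDemo {
    public static void main(String[] args) {
        // placeholder URL for illustration
        String html = PageDownLoadUtil.getPageContent("http://example.com");
        System.out.println(html);
    }
}
```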
Step 3: Define two interfaces.
```java
// page download
public interface IDownLoaddao {
    public Page downLoad(String url);
}

// page parsing
public interface Iprocessdao {
    public void process(Page page);
}
```
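Keeping download and parsing behind separate interfaces means either side can be swapped independently. As an illustration (not from the original post), a hypothetical offline downloader could read test pages from a local file:

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

// hypothetical implementation for offline testing; not part of the original post
public class FileDownLoadimpl implements IDownLoaddao {
    @Override
    public Page downLoad(String url) {
        Page page = new Page();
        try {
            // treat "url" as a local file path and read it as the page content
            page.setContent(new String(Files.readAllBytes(Paths.get(url))));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return page;
    }
}
```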
Step 4: Implement the interfaces.

The download implementation delegates to the utility class:
```java
public class HttpClientDownLoadimpl implements IDownLoaddao {
    @Override
    public Page downLoad(String url) {
        Page page = new Page();
        page.setContent(PageDownLoadUtil.getPageContent(url));
        return page;
    }
}
```
The parsing implementation uses HtmlCleaner plus an XPath expression to extract every `<h4>` element from the page:
```java
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class Processimpl implements Iprocessdao {
    @Override
    public void process(Page page) {
        // get the page content
        String content = page.getContent();
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode rootNode = htmlCleaner.clean(content);
        // XPath expression: select every h4 element
        String xPathExpression = "//*/h4";
        String data = null;
        try {
            // evaluate the XPath against the parsed HTML
            Object[] evaluateXPath = rootNode.evaluateXPath(xPathExpression);
            if (evaluateXPath.length > 0) { // result is non-empty
                for (Object node : evaluateXPath) {
                    // replace the HTML non-breaking-space entity with a regular space
                    data = ((TagNode) node).getText().toString().replaceAll("&nbsp;", " ");
                    System.out.println(data);
                }
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
    }
}
```
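To see what `evaluateXPath` returns in isolation, here is a minimal, self-contained sketch run against a made-up inline HTML string:

```java
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class XPathDemo {
    public static void main(String[] args) throws XPatherException {
        // made-up HTML for illustration
        String html = "<html><body><h4>First</h4><h4>Second</h4></body></html>";
        TagNode root = new HtmlCleaner().clean(html);
        // the same expression used in Processimpl
        Object[] nodes = root.evaluateXPath("//*/h4");
        for (Object node : nodes) {
            System.out.println(((TagNode) node).getText()); // prints First, then Second
        }
    }
}
```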
Step 5: The test class.
```java
public class Test {
    private IDownLoaddao downLoaddao; // download interface
    private Iprocessdao processdao;   // parsing interface

    public IDownLoaddao getDownLoaddao() {
        return downLoaddao;
    }

    public void setDownLoaddao(IDownLoaddao downLoaddao) {
        this.downLoaddao = downLoaddao;
    }

    public Iprocessdao getProcessdao() {
        return processdao;
    }

    public void setProcessdao(Iprocessdao processdao) {
        this.processdao = processdao;
    }

    // download a page
    public Page downloadPage(String url) {
        return this.downLoaddao.downLoad(url);
    }

    // parse a page
    public void processPage(Page page) {
        this.processdao.process(page);
    }

    public static void main(String[] args) {
        Test test = new Test();
        // inject the download implementation
        test.setDownLoaddao(new HttpClientDownLoadimpl());
        // inject the parsing implementation
        test.setProcessdao(new Processimpl());
        // page url
        String url = "http://sece.nfu.edu.cn/get?id=93&mid=19";
        // download the page
        Page page = test.downloadPage(url);
        // print the raw page content
        System.out.println(page.getContent());
        // parse the page
        test.processPage(page);
    }
}
```
# Crawl Results
(1) Downloaded page content (excerpt). (2) Parsed content (excerpt).