【JAVA】:爬虫

2020/5/23 Java

# 用Java实现简单网页爬虫

# 一、前期准备

(1)maven工程创建 (2)httpcomponents 和 htmlcleaner jar包支持

# 二、技术介绍

(1)创建maven工程方便工程管理以及jar包的导入。 (2)httpcomponents.jar包用于爬取页面信息。 (3)htmlcleaner.jar包用于解析html页面信息。 (4)XPath支持。 (5)正则表达式支持。

# 三、下载jar包

在maven工程中的pom.xml配置文件中添加以下配置信息:

<!-- Libraries the crawler depends on; add this section to the project's pom.xml. -->
<dependencies>

       <!-- Apache HttpComponents client: used to fetch page content. -->
       <dependency>
          <groupId>org.apache.httpcomponents</groupId>
          <artifactId>httpclient</artifactId>
          <version>4.4</version>
       </dependency>
       
       <!-- HtmlCleaner: used to parse the downloaded HTML. -->
       <dependency>
           <groupId>net.sourceforge.htmlcleaner</groupId>
           <artifactId>htmlcleaner</artifactId>
           <version>2.10</version>
       </dependency>
       
</dependencies>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

# 四、代码实现

     步骤一:创建页面类(保存页面信息)

/**
 * Simple holder for the information kept about one downloaded page.
 */
public class Page {

    /** Page content. */
    private String content;

    /**
     * Stores the content of this page.
     *
     * @param content the page content to keep
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Returns the content of this page.
     *
     * @return the stored content, or {@code null} if none has been set
     */
    public String getContent() {
        return this.content;
    }
}
1
2
3
4
5
6
7
8
9

     步骤二:具体实现类(工具类)

/**
 * Utility for downloading the content of a page over HTTP.
 */
public class PageDownLoadUtil {

	/**
	 * Issues a GET request for {@code url} and returns the response body as a string.
	 *
	 * <p>The HTTP client and the response are both closed before this method returns,
	 * so no connection is left open after the call.
	 *
	 * @param url address of the page to download
	 * @return the response body, or {@code null} if the request failed with an
	 *         I/O error or the response carried no body
	 */
	public static String getPageContent(String url){
		String content=null;
		// try-with-resources releases the response and then the client, even when
		// executing the request or reading the body throws.
		try (CloseableHttpClient client=HttpClients.custom().build();
				CloseableHttpResponse response=client.execute(new HttpGet(url))) {
			// Read the response body
			HttpEntity entity=response.getEntity();
			// A response without a body has no entity; keep the null result in that case.
			if (entity!=null) {
				// Convert the body to a string
				content=EntityUtils.toString(entity);
			}
		} catch (IOException e) {
			// ClientProtocolException is an IOException, so one handler covers both.
			e.printStackTrace();
		}
		return content;
	}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

     步骤三:定义两个接口

/**
 * Downloads pages.
 */
public interface IDownLoaddao {

  /**
   * Downloads the page at the given URL.
   *
   * @param url address of the page to download
   * @return the downloaded page
   */
  Page downLoad(String url);
}

/**
 * Parses pages.
 */
public interface Iprocessdao{

    /**
     * Parses the given page.
     *
     * @param page the page to parse
     */
    void process(Page page);
}
1
2
3
4
5
6
7
8
9

     步骤四:接口实现类

public class HttpClientDownLoadimpl implements IDownLoaddao{

	@Override
	public Page downLoad(String url) {
		Page page=new Page();
		page.setContent(PageDownLoadUtil.getPageContent(url));
		return page;
	}
	
	/**
	 * {@link Iprocessdao} implementation that prints the text of every
	 * {@code h4} element found in a page.
	 */
	public class Processimpl implements Iprocessdao {

		/**
		 * Parses the page content as HTML, selects all {@code h4} elements with an
		 * XPath expression and prints the text of each one to standard output,
		 * with every {@code &nbsp;} entity replaced by a space.
		 *
		 * <p>If the page or its content is {@code null}, nothing is parsed and a
		 * short notice is written to standard error.
		 *
		 * @param page the page to parse
		 */
		@Override
		public void process(Page page) {
			// Read the page content; skip parsing instead of handing null to the parser.
			String content=(page==null)?null:page.getContent();
			if(content==null){
				System.err.println("No page content to process");
				return;
			}
			TagNode rootNode=new HtmlCleaner().clean(content);
			// XPath expression
			String xPathExpression="//*/h4";
			try {
				// Evaluate the expression; an empty result simply skips the loop.
				for(Object node:rootNode.evaluateXPath(xPathExpression)){
					if(!(node instanceof TagNode)){
						// Only element nodes carry text to print here.
						continue;
					}
					// Literal replacement of the "&nbsp;" entity with a space (no regex needed)
					String data=((TagNode) node).getText().toString().replace("&nbsp;", " ");
					System.out.println(data);
				}
			} catch (XPatherException e) {
				e.printStackTrace();
			}
		}
	}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

     步骤五:测试类

/**
 * Demo entry point: downloads one page through an {@link IDownLoaddao} and
 * parses it through an {@link Iprocessdao}.
 */
public class Test {
	private IDownLoaddao downLoaddao;// download interface
	private Iprocessdao processdao;// parse interface

	/**
	 * @return the downloader currently in use, or {@code null} if none was set
	 */
	public IDownLoaddao getDownLoaddao() {
		return downLoaddao;
	}

	/**
	 * @param downLoaddao the downloader to use
	 */
	public void setDownLoaddao(IDownLoaddao downLoaddao) {
		this.downLoaddao = downLoaddao;
	}

	/**
	 * @return the parser currently in use, or {@code null} if none was set
	 */
	public Iprocessdao getProcessdao() {
		return processdao;
	}

	/**
	 * @param processdao the parser to use
	 */
	public void setProcessdao(Iprocessdao processdao) {
		this.processdao = processdao;
	}

	/**
	 * Downloads a page with the configured downloader.
	 *
	 * @param url address of the page to download
	 * @return the page returned by the downloader
	 */
	public Page downloadPage(String url){
		return this.downLoaddao.downLoad(url);
	}

	/**
	 * Parses a page with the configured parser.
	 *
	 * @param page the page to parse
	 */
	public void processPage(Page page){
		this.processdao.process(page);
	}

	/**
	 * Downloads a fixed URL, prints the raw content, then parses the page.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Test test=new Test();
		// Plug in the download implementation (setter and class names as declared in this article)
		test.setDownLoaddao(new HttpClientDownLoadimpl());
		// Plug in the parse implementation
		test.setProcessdao(new Processimpl());
		// Page URL
		String url="http://sece.nfu.edu.cn/get?id=93&mid=19";
		// Download the page
		Page page=test.downloadPage(url);
		// Print the page content
		System.out.println(page.getContent());
		// Parse the page
		test.processPage(page);
	}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

# 爬取结果

     (1)下载页面内容(部分)为 在这里插入图片描述      (2)解析内容(部分)