爬虫初体验(爬了牛客)

起初不知为何,有个路径总是抛出FileNotFound异常,后来该异常又无缘无故消失了,很奇怪。代码写得很渣,不过总归是爬到了,代码如下

主程序:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
package socket;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the Nowcoder campus-recruitment schedule page and prints every company
 * together with the closing date of its online-application window.
 *
 * <p>The HTML is scanned line by line. Entries whose closing date can be derived are
 * printed first, sorted by that date; entries whose date text cannot be interpreted are
 * printed afterwards in page order.
 */
public class SocketTest {

    /** Page that lists the recruitment schedule. */
    private static final String SCHEDULE_URL = "https://www.nowcoder.com/school/schedule";

    /** Year used for dates whose text does not spell out a year. */
    private static final int DEFAULT_YEAR = 2019;

    /** Opening tag of one company entry, e.g. {@code <li class=" js-company-item" data-id="665">}. */
    private static final Pattern ITEM_START = Pattern.compile("<li class=\" js-company-item\".+");

    /** Company-name line; group 1 is the text between the {@code <h2>} tags. */
    private static final Pattern COMPANY_NAME = Pattern.compile("<h2>(.+)</h2>");

    /** Online-application row; group 1 is the raw date text inside the inner span. */
    private static final Pattern APPLY_DATE = Pattern.compile(
            "<div class=\"act-company-info resume\">网申<span class=\"act-company-time\">(.*)</span></div>");

    /** Separator between the start and the end of a date range. */
    private static final Pattern DASHES = Pattern.compile("-+");

    /** Everything that is not a digit; used to cut a date text into its numeric tokens. */
    private static final Pattern NON_DIGITS = Pattern.compile("(\\D)+");

    /**
     * Fetches the schedule page, extracts the entries and prints them, followed by the two
     * list sizes and the set of distinct raw-date lengths (a debugging aid).
     *
     * @param args unused
     * @throws IOException kept for signature compatibility; I/O failures are reported on
     *                     {@code System.err} and end the run without printing results
     */
    public static void main(String[] args) throws IOException {
        List<Recruitment> list = new ArrayList<>();
        List<Recruitment> other = new ArrayList<>();
        Collection<Integer> lengths = new TreeSet<>();

        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(SCHEDULE_URL).openConnection();
            connection.setDoInput(true);
            connection.setRequestProperty("Host", "www.nowcoder.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
            // NOTE(review): "path", "authority", "scheme" and "method" look like HTTP/2
            // pseudo-headers copied from browser dev tools; here they are sent as ordinary
            // request headers. They are kept so the request stays exactly as before.
            connection.setRequestProperty("path", "/recommend");
            connection.setRequestProperty("authority", "www.nowcoder.com");
            connection.setRequestProperty("scheme", "https");
            connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
            connection.setRequestProperty("method", "GET");
            connection.connect();

            // Report an error status explicitly. Otherwise it only surfaces from
            // getInputStream() as an IOException (a FileNotFoundException for a 404),
            // which says little about what actually went wrong.
            int status = connection.getResponseCode();
            if (status >= HttpURLConnection.HTTP_BAD_REQUEST) {
                System.err.println("Request to " + SCHEDULE_URL + " failed with HTTP status " + status);
                return;
            }

            try (Scanner in = new Scanner(connection.getInputStream(), "UTF-8")) {
                collect(in, list, other, lengths);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + SCHEDULE_URL + ": " + e);
            return;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }

        Collections.sort(list);

        for (Recruitment recruitment : list) {
            System.out.println(recruitment);
        }
        for (Recruitment recruitment : other) {
            System.out.println(recruitment);
        }
        System.out.println(list.size());
        System.out.println(other.size());
        System.out.println(lengths);
    }

    /**
     * Walks the page once and turns every company entry into a {@code Recruitment}.
     *
     * <p>This replaces three nested read loops with a small state machine: wait for an
     * entry's opening tag, then for its name line, then for its online-application row.
     */
    private static void collect(Scanner in, List<Recruitment> dated, List<Recruitment> undated,
            Collection<Integer> lengths) {
        boolean insideItem = false;
        String company = null;

        while (in.hasNextLine()) {
            String line = in.nextLine();

            if (ITEM_START.matcher(line).matches()) {
                // A new entry starts. If the previous one never showed an application row,
                // drop it instead of pairing its name with this entry's date.
                insideItem = true;
                company = null;
                continue;
            }
            if (!insideItem) {
                continue;
            }

            if (company == null) {
                Matcher name = COMPANY_NAME.matcher(line);
                if (name.matches()) {
                    company = name.group(1).trim();
                }
                continue;
            }

            Matcher date = APPLY_DATE.matcher(line);
            if (date.matches()) {
                String raw = date.group(1).trim();
                if (!raw.isEmpty()) { // an empty span carries no schedule information
                    classify(company, raw, dated, undated);
                    lengths.add(raw.length());
                }
                insideItem = false;
                company = null;
            }
        }
    }

    /**
     * Files one entry under {@code dated} when its text names a closing date, and under
     * {@code undated} otherwise.
     *
     * <p>A closing date is recognised either as the part after the dash of a range, or as
     * the whole text when it ends with "截止" (deadline).
     */
    private static void classify(String company, String raw, List<Recruitment> dated,
            List<Recruitment> undated) {
        String closingPart = null;
        if (raw.contains("-")) {
            String[] range = DASHES.split(raw);
            if (range.length > 1) { // guard against a range with nothing after the dash
                closingPart = range[1];
            }
        } else if (raw.endsWith("截止")) {
            closingPart = raw;
        }

        if (closingPart == null) {
            undated.add(new Recruitment(company, raw));
        } else {
            dated.add(new Recruitment(company, formString(NON_DIGITS.split(closingPart), raw), raw));
        }
    }

    /**
     * Builds a date from the numeric tokens of a closing-date text.
     *
     * <ul>
     *   <li>one token: a month; the day is estimated from the suffix of {@code ostr}
     *       (see {@link #approximateDay(String)});</li>
     *   <li>two or four tokens: month and day (any further tokens are ignored);</li>
     *   <li>three tokens whose first is greater than 12: year, month, day;</li>
     *   <li>anything else, including unparsable tokens: the current time.</li>
     * </ul>
     *
     * <p>Leading empty tokens, which {@code split} produces when the text starts with a
     * non-digit, are skipped before the tokens are counted.
     *
     * @param strs numeric tokens of the closing-date text; may be empty or {@code null}
     * @param ostr the complete raw date text, used only for its suffix; may be {@code null}
     * @return the derived date, never {@code null}
     */
    public static Date formString(String[] strs, String ostr) {
        String label = ostr == null ? "" : ostr;
        String[] tokens = dropLeadingBlanks(strs);

        try {
            if (tokens.length == 1) {
                return dateOf(DEFAULT_YEAR, Integer.parseInt(tokens[0]), approximateDay(label));
            }
            if (tokens.length == 2 || tokens.length == 4) {
                return dateOf(DEFAULT_YEAR, Integer.parseInt(tokens[0]), Integer.parseInt(tokens[1]));
            }
            // The length is checked first so that an empty token array is never indexed.
            if (tokens.length == 3 && Integer.parseInt(tokens[0]) > 12) {
                int first = Integer.parseInt(tokens[0]);
                // Honour a four-digit year; shorter values keep the default year.
                int year = first >= 1000 ? first : DEFAULT_YEAR;
                return dateOf(year, Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
            }
        } catch (NumberFormatException e) {
            // A non-numeric or oversized token: treat like any other unrecognised format.
            return new Date();
        }
        return new Date();
    }

    /** Returns {@code tokens} without its leading null/empty elements ({@code null} becomes empty). */
    private static String[] dropLeadingBlanks(String[] tokens) {
        if (tokens == null) {
            return new String[0];
        }
        int start = 0;
        while (start < tokens.length && (tokens[start] == null || tokens[start].isEmpty())) {
            start++;
        }
        return start == 0 ? tokens : Arrays.copyOfRange(tokens, start, tokens.length);
    }

    /**
     * Estimates a day of month from a vague suffix: late month / month end gives 29,
     * mid month gives 15, early month / month start gives 5, anything else gives 1.
     */
    private static int approximateDay(String label) {
        if (label.endsWith("下旬") || label.endsWith("底")) {
            return 29;
        }
        if (label.endsWith("中旬")) {
            return 15;
        }
        if (label.endsWith("上旬") || label.endsWith("初")) {
            return 5;
        }
        return 1;
    }

    /** Creates a date from a year, a 1-based month and a day of month. */
    private static Date dateOf(int year, int month, int day) {
        return new GregorianCalendar(year, month - 1, day).getTime();
    }
}

相关类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package socket;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * One company entry of the recruitment schedule: the company name, the raw date text
 * taken from the page, and (when it could be derived) the closing date.
 *
 * <p>Natural ordering is by closing date, earliest first; entries without a closing date
 * sort after all dated entries.
 */
public class Recruitment implements Comparable<Recruitment> {

    /** Company name. */
    private String name;

    /** Closing date; {@code null} when the instance was built without one. */
    private Date edate;

    /** Date text exactly as it appeared on the page. */
    private String origindate;

    /**
     * Creates an entry with a known closing date.
     *
     * @param name       company name
     * @param edate      closing date
     * @param origindate raw date text the closing date was derived from
     */
    public Recruitment(String name, Date edate, String origindate) {
        super();
        this.name = name;
        this.edate = edate;
        this.origindate = origindate;
    }

    /**
     * Creates an entry without a closing date.
     *
     * @param name       company name
     * @param origindate raw date text
     */
    public Recruitment(String name, String origindate) {
        super();
        this.name = name;
        this.origindate = origindate;
    }

    public String getName() {
        return name;
    }

    public Date getEdate() {
        return edate;
    }

    public String getOrigindate() {
        return origindate;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setEdate(Date edate) {
        this.edate = edate;
    }

    public void setOrigindate(String origindate) {
        this.origindate = origindate;
    }

    /**
     * Orders by closing date. An entry without a closing date compares greater than any
     * dated entry and equal to another undated one, so sorting a mixed list no longer
     * fails with a {@code NullPointerException}.
     *
     * @param o the entry to compare with; must not be {@code null}
     */
    @Override
    public int compareTo(Recruitment o) {
        Date theirs = o.getEdate();
        if (this.edate == null) {
            return theirs == null ? 0 : 1;
        }
        if (theirs == null) {
            return -1;
        }
        return this.edate.compareTo(theirs);
    }

    /**
     * Renders the entry as {@code Recruitment [name=..., edate=..., origindate=...]}; the
     * {@code edate} part is omitted when no closing date is set.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Recruitment [name=").append(name);
        if (this.edate != null) {
            // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
            SimpleDateFormat pat = new SimpleDateFormat("yyyy 年 MM 月 dd 日");
            text.append(", edate=").append(pat.format(edate));
        }
        // The closing bracket was missing from the original output.
        return text.append(", origindate=").append(origindate).append("]").toString();
    }

}

运行结果:

总结:网络编程的知识之前学到过一些,不过很粗浅,这次自己动手加深了印象(以后查牛客也更方便了些23333)。问题主要在于对java网络类的底层实现不太了解,自己的网络基础比较薄弱;另外正则的匹配也是个问题,想要涵盖所有的情况并提取出相关信息,比较难(套了三层循环,不知是否有相关的简化办法?)

Donate comment here