1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
|
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.stream.Collectors;
/**
 * Single-threaded crawler that walks links starting from the rows stored in
 * {@code LINKS_UN_HANDLED}, stores every {@code <article>} it finds in {@code NEWS}
 * and records processed pages in {@code LINKS_IN_HANDLED}.
 *
 * <p>All state lives in a file-based H2 database (see {@link #setConnection()}); the
 * class keeps one shared JDBC connection in a static field and is not designed for
 * concurrent use.
 */
public class Main {
    /** Browser-like User-Agent sent with every page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36";

    /** Shared JDBC connection, opened by {@link #setConnection()}. */
    private static Connection connection;

    /**
     * Runs the crawl loop until {@code LINKS_UN_HANDLED} is empty.
     *
     * <p>Each iteration pops the newest pending link, skips it when it was already
     * handled or is not a wanted link, and otherwise fetches the page, queues its
     * outgoing links, stores its articles and marks it as handled.
     *
     * @param args unused
     * @throws IOException  if a page cannot be fetched
     * @throws SQLException if a database operation fails
     */
    public static void main(String[] args) throws IOException, SQLException {
        setConnection();
        // try-with-resources closes the connection on every exit path and keeps a
        // failure in close() from masking the exception that ended the crawl.
        try (Connection openConnection = connection) {
            String url;
            // removeUnHandledMaxUrl() returns null once the queue is empty, so there is
            // no need to load the whole table on every iteration just to test emptiness.
            while ((url = removeUnHandledMaxUrl()) != null) {
                if (selectInHandledWhereUrl(url)) {
                    continue;
                }
                if (isWantTo(url)) {
                    Document jsoup = parseHtmlToHrefAndInsertIntoLinksUnHandled(url);
                    jsoupCssSelectAirtcleAndParseToInsertIntoNews(jsoup, url);
                    updateUrlWithSql(url, "INSERT INTO LINKS_IN_HANDLED(link) VALUES ( ? )");
                }
            }
        }
    }

    /**
     * Loads every link currently stored in {@code LINKS_UN_HANDLED}.
     *
     * <p>Not used by {@link #main(String[])} any more (the crawl loop only needs the
     * newest link); kept as a helper for inspecting the pending queue.
     *
     * @return all pending links, possibly empty, never {@code null}
     * @throws SQLException if the query fails
     */
    private static ArrayList<String> selectUnHandledUrls() throws SQLException {
        ArrayList<String> list = new ArrayList<>();
        try (PreparedStatement preparedStatement = connection.prepareStatement("SELECT link FROM LINKS_UN_HANDLED");
             ResultSet resultSet = preparedStatement.executeQuery()) {
            while (resultSet.next()) {
                list.add(resultSet.getString(1));
            }
            return list;
        }
    }

    /**
     * Pops the pending link with the highest {@code id} from {@code LINKS_UN_HANDLED}.
     *
     * <p>The link is read with
     * {@code SELECT link ... WHERE id = (SELECT max(id) FROM LINKS_UN_HANDLED)} and then
     * removed with {@code DELETE FROM LINKS_UN_HANDLED WHERE link = ?}, so every row
     * holding the same link is removed, not just the newest one.
     *
     * @return the removed link, or {@code null} when the table is empty
     * @throws SQLException if the query or the delete fails
     */
    private static String removeUnHandledMaxUrl() throws SQLException {
        // Select the column by name instead of "SELECT *" + positional index so the
        // result does not depend on the physical column order of the table.
        try (PreparedStatement preparedStatement = connection.prepareStatement(
                "SELECT link FROM LINKS_UN_HANDLED WHERE id=(SELECT max(id) FROM LINKS_UN_HANDLED)");
             ResultSet resultSet = preparedStatement.executeQuery()) {
            if (!resultSet.next()) {
                return null;
            }
            String url = resultSet.getString(1);
            updateUrlWithSql(url, "DELETE FROM LINKS_UN_HANDLED WHERE link = ?");
            return url;
        }
    }

    /**
     * Checks whether a link is already recorded in {@code LINKS_IN_HANDLED}.
     *
     * @param url http or https link
     * @return {@code true} if at least one matching row exists, otherwise {@code false}
     * @throws SQLException if the query fails
     */
    private static boolean selectInHandledWhereUrl(String url) throws SQLException {
        try (PreparedStatement preparedStatement
                     = connection.prepareStatement("SELECT link FROM LINKS_IN_HANDLED WHERE link = ?")) {
            preparedStatement.setString(1, url);
            try (ResultSet resultSet = preparedStatement.executeQuery()) {
                return resultSet.next();
            }
        }
    }

    /**
     * Downloads and parses a page, queueing every wanted absolute link it contains
     * into {@code LINKS_UN_HANDLED}.
     *
     * <p>Only {@code href} values that start with {@code http} (case-insensitive) and
     * pass {@link #isWantTo(String)} are queued; relative links are ignored.
     *
     * @param url http or https link of the page to fetch
     * @return the parsed page
     * @throws SQLException if queueing a link fails
     * @throws IOException  if the page cannot be fetched
     */
    private static Document parseHtmlToHrefAndInsertIntoLinksUnHandled(String url) throws SQLException, IOException {
        Document jsoup = Jsoup.parse(httpGetHTML(url));
        for (Element a : jsoup.select("a")) {
            String href = a.attr("href");
            // regionMatches(ignoreCase) tests the prefix without allocating a lower-cased
            // copy of the whole href and without depending on the default locale.
            boolean isAbsoluteHttp = href.regionMatches(true, 0, "http", 0, 4);
            if (isAbsoluteHttp && isWantTo(href)) {
                updateUrlWithSql(href, "INSERT INTO LINKS_UN_HANDLED(link) VALUES ( ? )");
            }
        }
        return jsoup;
    }

    /**
     * Executes an update statement whose single {@code ?} placeholder is a link.
     *
     * @param link http or https link bound to parameter 1
     * @param sql  INSERT/DELETE statement containing exactly one placeholder
     * @throws SQLException if the statement fails
     */
    private static void updateUrlWithSql(String link, String sql) throws SQLException {
        try (PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            preparedStatement.setString(1, link);
            preparedStatement.executeUpdate();
        }
    }

    /**
     * Fetches the body of a page with an HTTP GET request.
     *
     * <p>Protocol-relative links ({@code //host/path}) are requested over https.
     *
     * @param link http or https link
     * @return response body as text
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String httpGetHTML(String link) throws IOException {
        String target = link.startsWith("//") ? "https:" + link : link;
        HttpGet request = new HttpGet(target);
        request.setHeader("User-Agent", USER_AGENT);
        // The client owns a connection pool; close it together with the response so
        // each call does not leak one.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(request)) {
            // Without an explicit fallback, a response that declares no charset is
            // decoded as ISO-8859-1, which garbles Chinese text; default to UTF-8.
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
    }

    /**
     * Stores every {@code <article>} element of a page as one row in {@code NEWS}.
     *
     * <p>The title is the comma-joined text of {@code h1.art_tit_h1} elements and the
     * content is the newline-joined text of {@code p.art_p} elements inside the
     * article. The page url and title are echoed to standard output after each insert.
     *
     * @param jsoup parsed page
     * @param url   link the page was fetched from, stored alongside the article
     * @throws SQLException if an insert fails
     */
    private static void jsoupCssSelectAirtcleAndParseToInsertIntoNews(Document jsoup, String url) throws SQLException {
        Elements articles = jsoup.select("article");
        if (articles.isEmpty()) {
            return;
        }
        // One prepared statement is reused for all articles of the page.
        try (PreparedStatement preparedStatement
                     = connection.prepareStatement("INSERT INTO NEWS(title, content, url) VALUES ( ?, ?, ? )")) {
            for (Element article : articles) {
                String title = article.select("h1.art_tit_h1")
                        .stream().map(Element::text).collect(Collectors.joining(","));
                String content = article.select("p.art_p")
                        .stream().map(Element::text).collect(Collectors.joining("\n"));
                preparedStatement.setString(1, title);
                preparedStatement.setString(2, content);
                preparedStatement.setString(3, url);
                preparedStatement.executeUpdate();
                System.out.println(url);
                System.out.println(title);
            }
        }
    }

    /**
     * Decides whether a link should be crawled: it must not be a login page and must
     * be either the home page or a news page.
     *
     * @param link http or https link
     * @return {@code true} if the link should be crawled
     */
    private static boolean isWantTo(String link) {
        return isNotLogin(link) && (isIndex(link) || isNews(link));
    }

    /**
     * Tells whether the link is exactly the home page {@code https://sina.cn}.
     *
     * @param link http or https link
     * @return {@code true} for the home page
     */
    private static boolean isIndex(String link) {
        return "https://sina.cn".equals(link);
    }

    /**
     * Tells whether the link is NOT a login page, i.e. does not contain
     * {@code passport.sina.cn}.
     *
     * @param link http or https link
     * @return {@code true} if the link is not a login page
     */
    private static boolean isNotLogin(String link) {
        return !link.contains("passport.sina.cn");
    }

    /**
     * Tells whether the link is a news page, i.e. contains {@code news.sina.cn}.
     *
     * @param link http or https link
     * @return {@code true} for a news page
     */
    private static boolean isNews(String link) {
        return link.contains("news.sina.cn");
    }

    /**
     * Opens the shared connection to the file-based H2 database named {@code news}.
     *
     * <p>The database file is resolved against the {@code basedir} system property,
     * falling back to {@code user.dir} when {@code basedir} is not set.
     *
     * @throws SQLException if the connection cannot be opened
     */
    public static void setConnection() throws SQLException {
        File projectDir = new File(System.getProperty("basedir", System.getProperty("user.dir")));
        String jdbcUrl = "jdbc:h2:file:" + new File(projectDir, "news").getAbsolutePath();
        connection = DriverManager.getConnection(jdbcUrl, "root", "toor");
    }
}
|