|
|
|
package com.ruoyi.quartz.task.aquatic;
|
|
|
|
|
|
|
|
import com.ruoyi.common.utils.DateUtils;
|
|
|
|
import com.ruoyi.common.utils.StringUtils;
|
|
|
|
import com.ruoyi.system.domain.fish.FishAquaticPublicOpinion;
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
import org.jsoup.nodes.Element;
|
|
|
|
import org.jsoup.select.Elements;
|
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.file.Path;
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
@Service
|
|
|
|
public class WwwMoaGovCn extends AquaticPublicOpinionBase{
|
|
|
|
/**
|
|
|
|
* 中华人民共和国农业农村部网站
|
|
|
|
* @param day
|
|
|
|
* @return
|
|
|
|
*/
|
|
|
|
@Override
|
|
|
|
public List<FishAquaticPublicOpinion> collect(String day)
|
|
|
|
{
|
|
|
|
//先下载规章
|
|
|
|
nyncbgzk();
|
|
|
|
List<FishAquaticPublicOpinion> list = govpublic(day);
|
|
|
|
return list;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
String getInfo(String info_url) {
|
|
|
|
try {
|
|
|
|
// 访问网页
|
|
|
|
Document doc = createDocument(info_url);
|
|
|
|
|
|
|
|
// 查找正文区域,常见class为 TRS_Editor 或 article
|
|
|
|
Element content = doc.selectFirst("div.gsj_htmlcon");
|
|
|
|
|
|
|
|
if (content != null) {
|
|
|
|
StringBuilder text = new StringBuilder();
|
|
|
|
|
|
|
|
// 遍历段落,保留换行
|
|
|
|
for (Element p : content.select("p")) {
|
|
|
|
String line = p.text().trim();
|
|
|
|
if (!line.isEmpty()) {
|
|
|
|
text.append(line).append("\n\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return text.toString();
|
|
|
|
} else {
|
|
|
|
logger.error("未找到正文内容区域:"+info_url);
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
logger.error("解析详情错误:"+info_url,e);
|
|
|
|
}
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
WwwMoaGovCn wwwMoaGovCn = new WwwMoaGovCn();
|
|
|
|
wwwMoaGovCn.nyncbgzk();
|
|
|
|
}
|
|
|
|
|
|
|
|
private List<FishAquaticPublicOpinion> govpublic(String day)
|
|
|
|
{
|
|
|
|
String domain = "https://www.moa.gov.cn/govpublic/";
|
|
|
|
List<FishAquaticPublicOpinion> list = new ArrayList<>();
|
|
|
|
try {
|
|
|
|
// 解析页面
|
|
|
|
Document doc = createDocument(domain);
|
|
|
|
// 选中 div.m_list 下的 ul 中的所有 li
|
|
|
|
Elements liElements = doc.select("ul.commonlist li");
|
|
|
|
// 打印所有 li 的 HTML
|
|
|
|
for (Element li : liElements) {
|
|
|
|
Elements a = li.select("a");
|
|
|
|
|
|
|
|
String url = a.attr("abs:href");
|
|
|
|
|
|
|
|
String title = a.attr("title");
|
|
|
|
|
|
|
|
String time = li.select("span").text();
|
|
|
|
if(StringUtils.isNotEmpty(day) && DateUtils.parseDate(time,DateUtils.YYYY_MM_DD).equals(DateUtils.parseDate(day,DateUtils.YYYY_MM_DD)))
|
|
|
|
{
|
|
|
|
FishAquaticPublicOpinion aquaticPublicOpinion = new FishAquaticPublicOpinion();
|
|
|
|
aquaticPublicOpinion.setTitle(title);
|
|
|
|
aquaticPublicOpinion.setInfoUrl(url);
|
|
|
|
aquaticPublicOpinion.setReleaseTime(DateUtils.parseDate(time,DateUtils.YYYY_MM_DD));
|
|
|
|
aquaticPublicOpinion.setCreateTime(new Date());
|
|
|
|
list.add(aquaticPublicOpinion);
|
|
|
|
}else {
|
|
|
|
return list;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
logger.error("数据解析错误:"+domain,e);
|
|
|
|
}
|
|
|
|
return list;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* 农业农村部
|
|
|
|
* 规章
|
|
|
|
* @return
|
|
|
|
*/
|
|
|
|
private void nyncbgzk()
|
|
|
|
{
|
|
|
|
String firstUrl = "https://www.moa.gov.cn/gk/nyncbgzk/";
|
|
|
|
try {
|
|
|
|
// 解析页面
|
|
|
|
Document doc = createDocument(firstUrl);
|
|
|
|
// 选中 div.m_list 下的 ul 中的所有 li
|
|
|
|
Elements liElements = doc.select("div.gz_list ul li div.title");
|
|
|
|
File[] files = getDirFils(Paths.get("uploadPath/gz/").toAbsolutePath().toString());
|
|
|
|
for (Element li : liElements)
|
|
|
|
{
|
|
|
|
Elements a = li.select("a");
|
|
|
|
String url = a.attr("abs:href");
|
|
|
|
String title = a.text()+".txt";
|
|
|
|
if(isStringInTop20Files(files, title))
|
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Document info_doc = createDocument(url);
|
|
|
|
Elements content = info_doc.select("div.gz_content");
|
|
|
|
Path path = Paths.get("uploadPath/gz/"+title);
|
|
|
|
saveFile(content.text(),path);
|
|
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
|
|
logger.error("数据解析错误:"+firstUrl,e);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* 判断指定字符串str是否存在于指定目录中按修改时间排序的前20个文件名中
|
|
|
|
* @param files 指定目录路径
|
|
|
|
* @param str 要匹配的字符串
|
|
|
|
* @return 如果存在则返回true,否则返回false
|
|
|
|
*/
|
|
|
|
public static boolean isStringInTop20Files( File[] files , String str) {
|
|
|
|
|
|
|
|
// 取前20个文件名判断是否包含指定字符串
|
|
|
|
for (int i = 0; i < Math.min(20, files.length); i++) {
|
|
|
|
String fileName = files[i].getName();
|
|
|
|
if (fileName.contains(str)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static File[] getDirFils(String dirPath)
|
|
|
|
{
|
|
|
|
File dir = new File(dirPath);
|
|
|
|
if (!dir.exists() || !dir.isDirectory()) {
|
|
|
|
System.err.println("目录不存在或不是一个目录: " + dirPath);
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
File[] files = dir.listFiles();
|
|
|
|
if (files == null || files.length == 0) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
// 按照文件最后修改时间降序排序
|
|
|
|
Arrays.sort(files, Comparator.comparingLong(File::lastModified).reversed());
|
|
|
|
return files;
|
|
|
|
}
|
|
|
|
} |
...
|
...
|
|