package com.ruoyi.common.html;
|
|
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.stream.Collectors;
|
|
/**
|
* 解析html
|
*/
|
public class AnalysisHTML {
|
|
public static String explainHTML() throws IOException {
|
String folderPath = "E:\\pc2";
|
// 获取文件夹下的所有文件
|
File[] files = new File(folderPath).listFiles();
|
// 遍历文件夹下的所有文件
|
int i = 0;
|
int b = 0;
|
a:
|
for (File file : files) {
|
// 获取文件名
|
String fileName = file.getName();
|
System.out.println(fileName);
|
// 获取文件内容
|
String fileContent = new String(Files.readAllBytes(file.toPath()));
|
|
Document parse = Jsoup.parse(fileContent);
|
|
String title = parse.title();
|
Element body = parse.body();
|
Elements elementsByClass = body.getElementsByClass("topic__type-body");
|
Elements children = elementsByClass.get(0).children();
|
for (Element element : children) {
|
int legend = element.select("legend").size();
|
File file1 = null;
|
if (legend == 0) {
|
//跳过该文件
|
file1 = new File("E:\\noexplain");
|
b++;
|
continue a;
|
}
|
if (!element.getElementsByTag("legend").attr("class").equals("topic__type-title")) {
|
//跳过该文件
|
file1 = new File("E:\\noexplain");
|
b++;
|
continue a;
|
}
|
}
|
|
for (Element element : children) {
|
i++;
|
//获取题目
|
String legend = element.getElementsByTag("legend").text();
|
System.out.println("题目: " + legend);
|
//将题目保存到表中
|
|
//判断该题目下是否有选项
|
Elements elementsByClass1 = element.getElementsByClass("topic__type-dry");
|
if (elementsByClass1.size() != 0) {
|
Elements spans = elementsByClass1.get(0).getElementsByTag("span");
|
String item = "";
|
for (Element span : spans) {
|
Elements labels = span.getElementsByTag("label");
|
for (Element label : labels) {
|
String text = label.text();
|
//将题目存库
|
item = item + "------" + text;
|
}
|
}
|
System.out.println("选项: " + item);
|
}
|
|
}
|
|
}
|
System.out.println(i + " " + b);
|
|
return null;
|
}
|
|
public static void main(String[] args) throws IOException {
|
explainHTML();
|
}
|
}
|