package com.ruoyi.common.html;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.stream.Collectors;
/**
 * Parses HTML question pages from a folder and prints each question together
 * with its answer options.
 *
 * <p>A page is processed only when every direct child of its first
 * {@code topic__type-body} element contains a {@code legend} whose class is
 * exactly {@code topic__type-title}; otherwise the whole page is skipped.
 */
public class AnalysisHTML {

    /** Folder scanned by the no-argument {@link #explainHTML()} entry point. */
    private static final String DEFAULT_FOLDER = "E:\\pc2";

    /** CSS class of the container whose direct children are the questions. */
    private static final String QUESTION_CONTAINER_CLASS = "topic__type-body";

    /** CSS class a question's {@code legend} must carry for the page to be accepted. */
    private static final String QUESTION_TITLE_CLASS = "topic__type-title";

    /** CSS class of the element that holds a question's options. */
    private static final String OPTION_CONTAINER_CLASS = "topic__type-dry";

    /**
     * Scans the default folder; see {@link #explainHTML(String)}.
     *
     * @return always {@code null}
     * @throws IOException if the default folder cannot be listed or a file cannot be read
     */
    public static String explainHTML() throws IOException {
        return explainHTML(DEFAULT_FOLDER);
    }

    /**
     * Parses every regular file directly inside {@code folderPath} as HTML and
     * prints its questions and options to standard output. After all files are
     * handled, one summary line {@code "<questions printed> <files skipped>"} is printed.
     *
     * @param folderPath directory containing the HTML files
     * @return always {@code null}; kept for compatibility with existing callers
     * @throws IOException if {@code folderPath} cannot be listed as a directory
     *                     or a file cannot be read
     */
    public static String explainHTML(String folderPath) throws IOException {
        File[] files = new File(folderPath).listFiles();
        if (files == null) {
            // listFiles() yields null instead of throwing when the path is
            // missing, is not a directory, or cannot be read.
            throw new IOException("Cannot list directory: " + folderPath);
        }

        int questionCount = 0;
        int skippedFiles = 0;
        for (File file : files) {
            if (!file.isFile()) {
                // Sub-directories cannot be parsed as HTML; ignore them.
                continue;
            }
            System.out.println(file.getName());

            // A null charset lets jsoup detect the encoding from the document
            // itself (falling back to UTF-8) instead of the platform default.
            Document document = Jsoup.parse(file, null);
            Elements containers = document.body().getElementsByClass(QUESTION_CONTAINER_CLASS);
            if (containers.isEmpty()) {
                // No question container at all: nothing to extract from this page.
                skippedFiles++;
                continue;
            }

            Elements questions = containers.get(0).children();
            if (!allQuestionsHaveTitle(questions)) {
                skippedFiles++;
                continue;
            }

            for (Element question : questions) {
                questionCount++;
                printQuestion(question);
            }
        }
        System.out.println(questionCount + " " + skippedFiles);
        return null;
    }

    /**
     * Returns {@code true} when every element contains a {@code legend} and the
     * class attribute reported for its legends equals {@code topic__type-title}.
     */
    private static boolean allQuestionsHaveTitle(Elements questions) {
        for (Element question : questions) {
            Elements legends = question.getElementsByTag("legend");
            if (legends.isEmpty()) {
                return false;
            }
            if (!QUESTION_TITLE_CLASS.equals(legends.attr("class"))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Prints the question text and, when an option container is present, the
     * text of every option label found inside it.
     */
    private static void printQuestion(Element question) {
        String questionText = question.getElementsByTag("legend").text();
        System.out.println("题目: " + questionText);
        // NOTE(review): the original comments say the question and its options
        // should be saved to a database table; that is not implemented here.

        Elements optionContainers = question.getElementsByClass(OPTION_CONTAINER_CLASS);
        if (optionContainers.isEmpty()) {
            return;
        }

        StringBuilder options = new StringBuilder();
        for (Element span : optionContainers.get(0).getElementsByTag("span")) {
            for (Element label : span.getElementsByTag("label")) {
                // Each option is prefixed with the same separator, including the first.
                options.append("------").append(label.text());
            }
        }
        System.out.println("选项: " + options);
    }

    /**
     * Command-line entry point: processes the default folder.
     *
     * @param args ignored
     * @throws IOException if the default folder or one of its files cannot be read
     */
    public static void main(String[] args) throws IOException {
        explainHTML();
    }
}