package com.ruoyi.common.html; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.util.stream.Collectors; /** * 解析html */ public class AnalysisHTML { public static String explainHTML() throws IOException { String folderPath = "E:\\pc2"; // 获取文件夹下的所有文件 File[] files = new File(folderPath).listFiles(); // 遍历文件夹下的所有文件 int i = 0; int b = 0; a: for (File file : files) { // 获取文件名 String fileName = file.getName(); System.out.println(fileName); // 获取文件内容 String fileContent = new String(Files.readAllBytes(file.toPath())); Document parse = Jsoup.parse(fileContent); String title = parse.title(); Element body = parse.body(); Elements elementsByClass = body.getElementsByClass("topic__type-body"); Elements children = elementsByClass.get(0).children(); for (Element element : children) { int legend = element.select("legend").size(); File file1 = null; if (legend == 0) { //跳过该文件 file1 = new File("E:\\noexplain"); b++; continue a; } if (!element.getElementsByTag("legend").attr("class").equals("topic__type-title")) { //跳过该文件 file1 = new File("E:\\noexplain"); b++; continue a; } } for (Element element : children) { i++; //获取题目 String legend = element.getElementsByTag("legend").text(); System.out.println("题目: " + legend); //将题目保存到表中 //判断该题目下是否有选项 Elements elementsByClass1 = element.getElementsByClass("topic__type-dry"); if (elementsByClass1.size() != 0) { Elements spans = elementsByClass1.get(0).getElementsByTag("span"); String item = ""; for (Element span : spans) { Elements labels = span.getElementsByTag("label"); for (Element label : labels) { String text = label.text(); //将题目存库 item = item + "------" + text; } } System.out.println("选项: " + item); } } } System.out.println(i + " " + b); return null; } public static void main(String[] args) throws IOException { explainHTML(); } }