org.jsoup.parser.Parser类的使用及代码示例

x33g5p2x  于2022-01-26 转载在 其他  
字(14.5k)|赞(0)|评价(0)|浏览(217)

本文整理了Java中org.jsoup.parser.Parser类的一些代码示例,展示了Parser类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Parser类的具体详情如下:
包路径:org.jsoup.parser.Parser
类名称:Parser

Parser介绍

[英]Parses HTML into a org.jsoup.nodes.Document. Generally best to use one of the more convenient parse methods in org.jsoup.Jsoup.
[中]将HTML解析为org.jsoup.nodes.Document。通常,最好使用org.jsoup.Jsoup中更方便的parse方法之一。

代码示例

代码示例来源:origin: deeplearning4j/dl4j-examples

Document document = Jsoup.parse(str, "", Parser.xmlParser());
String descr;
Elements patent = document.select("us-patent-grant");
if (patent.size() > 0) {
  Elements mainClassification = e2.select("main-classification");
  if (mainClassification == null || mainClassification.size() == 0) {
    log.warn("Skipping patent {} in document - no main classification");
    return null;
  String main = e2.select("main-classification").outerHtml().replaceAll("\n", "")
      .replaceAll("<main-classification>", "").replaceAll("</main-classification>", "")
      .replaceFirst(" ", ""); //Replace first space - not significant, always present. But SECOND space is important
  descr = patent.select("description").text();
} else {
  patent = document.select("PATDOC");
  if (patent.size() > 0) {
    title = patent.select("B540").first().text();
    abstr = patent.select("SDOAB").text();
    claims = patent.select("SDOCL").text();

代码示例来源:origin: org.jsoup/jsoup

/**
 * Opens the given file and parses its contents into a Document using the HTML parser.
 *
 * @param in file to read
 * @param charsetName character set name passed through to the stream parser
 * @param baseUri base URI of the document, used to resolve relative links
 * @return the parsed Document
 * @throws IOException if the file cannot be opened or read
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
  // Open the stream first so a missing file fails before any parser is created.
  final FileInputStream fileStream = new FileInputStream(in);
  final Parser htmlParser = Parser.htmlParser();
  return parseInputStream(fileStream, charsetName, baseUri, htmlParser);
}

代码示例来源:origin: org.jsoup/jsoup

static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException  {
  if (input == null) // empty body
    return new Document(baseUri);
  input = ConstrainableInputStream.wrap(input, bufferSize, 0);
    doc = parser.parseInput(docData, baseUri);
    Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
      if (meta.hasAttr("http-equiv"))
        foundCharset = getCharsetFromContentType(meta.attr("content"));
      if (foundCharset == null && meta.hasAttr("charset"))
        foundCharset = meta.attr("charset");
      if (foundCharset != null)
    if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
      XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
      if (prolog.name().equals("xml"))
      reader.skip(1);
    try {
      doc = parser.parseInput(reader, baseUri);
    } catch (UncheckedIOException e) {

代码示例来源:origin: USPTO/PatentPublicData

rawText = rawText.replaceAll("", "</q>");
Document jsoupDoc = Jsoup.parse("<body>" + rawText + "</body>", "", Parser.xmlParser());
jsoupDoc.outputSettings().prettyPrint(false).syntax(Syntax.xml).charset(StandardCharsets.UTF_16);
jsoupDoc.select("bold").tagName("b");
Elements figRefEls = jsoupDoc.select("FGREF");
for (int i = 1; i <= figRefEls.size(); i++) {
  Element element = figRefEls.get(i - 1);
  element.attr("id", "FR-" + Strings.padStart(String.valueOf(i), 4, '0'));
  element.attr("idref", ReferenceTagger.createFigId(element.select("PDAT").text()));
  element.tagName("a");
  element.addClass("figref");
  jsoupDoc = Jsoup.parse("<body>" + fieldTextCleaned + "</body>", "", Parser.xmlParser());
  jsoupDoc.outputSettings().prettyPrint(false).syntax(OutputSettings.Syntax.xml).charset(StandardCharsets.UTF_16);

代码示例来源:origin: opacapp/opacclient

static List<LentItem> parse_medialist(Document doc) {
  List<LentItem> media = new ArrayList<>();
  Elements copytrs = doc.select(".data tr");
    LentItem item = new LentItem();
    if (tr.text().contains("keine Daten")) {
      return null;
    item.setTitle(tr.select(".account-display-title").select("b, strong")
            .text().trim());
    try {
      item.setRenewable(false);
      if (tr.select("a").size() > 0) {
        for (Element link : tr.select("a")) {
          String href = link.attr("abs:href");
      if (lines.length == 4 || lines.length == 5) {
        item.setAuthor(Jsoup.parse(lines[1]).text().trim());
        item.setBarcode(Jsoup.parse(lines[2]).text().trim());
        if (lines.length == 5) {
        item.setBarcode(Parser.unescapeEntities(lines[1].trim(), false));
        item.setStatus(Parser.unescapeEntities(lines[2].trim(), false));
      } else if (lines.length == 2) {
        item.setAuthor(Parser.unescapeEntities(lines[1].trim(), false));

代码示例来源:origin: USPTO/PatentPublicData

Document jsoupDoc = Jsoup.parse("<body>" + rawText + "</body>", "", Parser.xmlParser());
jsoupDoc.outputSettings().prettyPrint(false).charset(StandardCharsets.UTF_16);
Elements figEls = jsoupDoc.select("a.figref");
for (int i = 1; i <= figEls.size(); i++) {
  Element element = figEls.get(i - 1);
  element.attr("id", "FR-" + Strings.padStart(String.valueOf(i), 4, '0'));
Elements headerEls = jsoupDoc.select("PAC");
for (int i = 1; i <= headerEls.size(); i++) {
  Element element = headerEls.get(i - 1);
  element.attr("id", "H-" + Strings.padStart(String.valueOf(i), 4, '0'));
  element.tagName("h2");

代码示例来源:origin: samczsun/Skype4J

@Override
  public void handle(SkypeImpl skype, JsonObject resource) throws ConnectionException, ChatNotFoundException, IOException {
    // Pull the raw fields out of the resource before validating any of them.
    final String payload = Utils.getString(resource, "content");
    final String conversationLink = Utils.getString(resource, "conversationLink");
    final String sender = getAuthor(resource);
    Validate.notNull(payload, "Null content");
    Validate.notNull(conversationLink, "Null chat");
    Validate.notNull(sender, "Null author");

    final String senderName = getUsername(sender);
    Validate.notNull(senderName, "Null username");

    final Chat conversation = getChat(conversationLink, skype);
    Validate.notNull(conversation, "Null chatobj");

    final Participant origin = conversation.getParticipant(senderName);
    Validate.notNull(origin, "Null initiator");

    // The content is parsed with the XML parser; every <file> element becomes one received file.
    final Document parsed = Parser.xmlParser().parseInput(payload, "");
    final List<ReceivedFile> files = parsed.getElementsByTag("file").stream()
        .map(fileElement -> {
          String fileName = fileElement.text();
          long size = Long.parseLong(fileElement.attr("size"));
          long tid = Long.parseLong(fileElement.attr("tid"));
          return new ReceivedFileImpl(fileName, size, tid);
        })
        .collect(Collectors.toList());

    skype.getEventDispatcher().callEvent(new FileReceivedEvent(conversation, origin, files));
  }
},

代码示例来源:origin: DigitalPebble/storm-crawler

.decode(ByteBuffer.wrap(content)).toString();
jsoupDoc = Parser.htmlParser().parseInput(html, url);
    .selectFirst("meta[name~=(?i)robots][content]");
if (robotelement != null) {
  robotsTags.extractMetaTags(robotelement.attr("content"));
  slinks = new HashMap<>(0);
} else {
  Elements links = jsoupDoc.select("a[href]");
  slinks = new HashMap<>(links.size());
  for (Element link : links) {
    String targetURL = link.attr("abs:href");
        .attr("rel"));
Element body = jsoupDoc.body();
if (body != null) {
  text = textExtractor.text(body);

代码示例来源:origin: de.unistuttgart.ims/de.unistuttgart.ims.drama.io.core

public static void getNext(JCas jcas, InputStream file, Drama drama, boolean strict)
    throws IOException, CollectionException {
  Document doc = Jsoup.parse(file, "UTF-8", "", Parser.xmlParser());
  drama.setDocumentTitle(doc.select("titleStmt > title").first().text());
  if (!doc.select("idno[type=\"TextGridUri\"]").isEmpty())
    drama.setDocumentId(doc.select("idno[type=\"TextGridUri\"]").first().text().substring(9));
    Element authorElement = authorElements.get(i);
    Author author = new Author(jcas);
    author.setName(authorElement.text());
    if (authorElement.hasAttr("key")) {
      author.setPnd(authorElement.attr("key").replace("pnd:", "http://d-nb.info/gnd/"));

代码示例来源:origin: USPTO/PatentPublicData

/**
 * Converts raw markup into plain text.
 *
 * <p>The input is parsed with the XML parser, each {@code PARA} element is prefixed with
 * {@code level + 1} non-breaking-space entities (where {@code level} is the element's
 * {@code LVL} attribute, or 0 when the attribute is absent or empty), the result is passed
 * through {@code getSimpleHtml}, re-parsed, and finally handed to {@link HtmlToPlainText}.
 *
 * @param rawText markup to convert
 * @param textConfig configuration passed to the plain-text converter
 * @return the plain-text rendering
 * @throws NumberFormatException if a {@code LVL} attribute is present but not an integer
 */
@Override
public String getPlainText(String rawText, FreetextConfig textConfig) {
  Document jsoupDoc = Jsoup.parse(rawText, "", Parser.xmlParser());
  for (Element paragraph : jsoupDoc.select("PARA")) {
    // jsoup's attr() returns "" (never null) for a missing attribute, so a null check
    // can never select the default; test for the empty string instead.
    String levelAttr = paragraph.attr("LVL");
    int level = levelAttr.isEmpty() ? 0 : Integer.parseInt(levelAttr);
    StringBuilder stb = new StringBuilder();
    for (int i = 0; i <= level; i++) {
      stb.append("&nbsp;");
    }
    paragraph.prepend(stb.toString());
  }
  String simpleHtml = getSimpleHtml(jsoupDoc.outerHtml());
  Document simpleDoc = Jsoup.parse(simpleHtml, "", Parser.xmlParser());
  HtmlToPlainText htmlConvert = new HtmlToPlainText(textConfig);
  return htmlConvert.getPlainText(simpleDoc);
}

代码示例来源:origin: starlightknight/swagger-confluence

private static String reformatXHtml(final String inputXhtml, final Map<String, ConfluenceLink> confluenceLinkMap) {
  final Document document = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser());
  document.outputSettings().prettyPrint(false);
  document.outputSettings().escapeMode(xhtml);
  document.outputSettings().charset("UTF-8");
  final Elements linkElements = document.select("a");
    final String originalHref = linkElement.attr("href");
    final ConfluenceLink confluenceLink = confluenceLinkMap.get(originalHref);
    linkElement.before(confluenceLinkMarkup);
    linkElement.html("");
    linkElement.unwrap();

代码示例来源:origin: USPTO/PatentPublicData

/**
 * Splits the given raw text into paragraphs: the text is normalised via
 * {@code getSimpleHtml}, parsed with the XML parser, and the inner HTML of every
 * {@code p} element is returned in document order.
 *
 * @param rawText markup to split
 * @return inner HTML of each {@code p} element
 */
@Override
  public List<String> getParagraphText(String rawText) {
    Document parsed = Jsoup.parse(getSimpleHtml(rawText), "", Parser.xmlParser());
    Elements paragraphElements = parsed.select("p");

    List<String> result = new ArrayList<String>();
    for (Element p : paragraphElements) {
      result.add(p.html());
    }
    return result;
  }
}

代码示例来源:origin: DigitalPebble/storm-crawler

/**
 * Looks for a META tag in the HTML that declares the document's character set,
 * either {@code <meta http-equiv="Content-Type" content="text/html;charset=...">}
 * or the HTML5 form {@code <meta charset="...">}.
 *
 * @param buffer raw document bytes
 * @param maxlength when positive and smaller than the buffer, only this many leading
 *        bytes are inspected; otherwise the whole buffer is used
 * @return the first charset found, or {@code null} if no META tag declares one
 */
private static String getCharsetFromMeta(byte buffer[], int maxlength) {
  final int limit = (maxlength > 0 && maxlength < buffer.length) ? maxlength : buffer.length;

  // Decode with the default charset; the markup characters of interest are
  // expected to survive even if the real encoding differs.
  final String markup = new String(buffer, 0, limit, DEFAULT_CHARSET);
  final Document parsed = Parser.htmlParser().parseInput(markup, "dummy");

  for (Element meta : parsed.select("meta[http-equiv=content-type], meta[charset]")) {
    String candidate = null;
    if (meta.hasAttr("http-equiv")) {
      candidate = getCharsetFromContentType(meta.attr("content"));
    }
    if (candidate == null && meta.hasAttr("charset")) {
      candidate = meta.attr("charset");
    }
    if (candidate != null) {
      return candidate;
    }
  }
  // No matching META element yielded a charset.
  return null;
}

代码示例来源:origin: crazyhitty/Munch

/**
 * Loads the OPML document, from {@code mUrl} (fetched with the XML parser) when it is set,
 * otherwise from {@code mFile}, and stores its {@code outline} elements in
 * {@code mOpmlItems}.
 *
 * @param strings unused
 * @return {@code "success"}, or the exception message if loading failed with an IOException
 */
@Override
protected String doInBackground(String... strings) {
  Document opml;
  try {
    opml = (mUrl != null)
        ? Jsoup.connect(mUrl).parser(Parser.xmlParser()).get()
        : Jsoup.parse(mFile, "UTF-8");
  } catch (IOException e) {
    e.printStackTrace();
    return e.getMessage();
  }

  if (opml != null) {
    mOpmlItems = opml.select("outline");
  }
  return "success";
}

代码示例来源:origin: TeamNewPipe/NewPipeExtractor

/**
 * Reads an OPML document from the stream and converts every {@code outline[type=rss]}
 * entry into a subscription item.
 *
 * @param contentInputStream stream containing the OPML content
 * @return one item per RSS outline, in document order
 * @throws ExtractionException (as InvalidSourceException) if the document has no
 *         {@code opml} or {@code outline} element, if an RSS outline lacks a title or
 *         URL, or if no id can be extracted from its URL
 */
private List<SubscriptionItem> getItemsFromOPML(InputStream contentInputStream) throws ExtractionException {
  final List<SubscriptionItem> items = new ArrayList<>();

  final String opmlText = readFromInputStream(contentInputStream);
  final Document opml = Jsoup.parse(opmlText, "", org.jsoup.parser.Parser.xmlParser());

  // Structural sanity checks before looking at individual entries.
  if (document_isMissing(opml, "opml")) {
    throw new InvalidSourceException("document does not have OPML tag");
  }
  if (document_isMissing(opml, "outline")) {
    throw new InvalidSourceException("document does not have at least one outline tag");
  }

  for (Element entry : opml.select("outline[type=rss]")) {
    final String feedUrl = entry.attr("abs:xmlUrl");
    final String name = entry.attr("title");

    if (name.isEmpty() || feedUrl.isEmpty()) {
      throw new InvalidSourceException("document has invalid entries");
    }

    try {
      // NOTE(review): unqualified Parser here is presumably the project's own regex helper,
      // not org.jsoup.parser.Parser — confirm against the file's imports.
      final String channelId = Parser.matchGroup1(ID_PATTERN, feedUrl);
      items.add(new SubscriptionItem(service.getServiceId(), BASE_CHANNEL_URL + channelId, name));
    } catch (Parser.RegexException e) {
      throw new InvalidSourceException("document has invalid entries", e);
    }
  }

  return items;
}

/** Returns true when the document contains no element matching the given selector. */
private static boolean document_isMissing(Document doc, String selector) {
  return doc.select(selector).isEmpty();
}

代码示例来源:origin: addthis/hydra

Parser parser = Parser.htmlParser().setTrackErrors(0);
@Nonnull Document doc = parser.parseInput(html, "");
@Nonnull Elements tags = doc.select(tagName);
    @Nonnull String attrValue = tag.attr(tagAttr).toLowerCase();
    for (String matchValue : values) {
      if (attrValue.contains(matchValue)) {

代码示例来源:origin: org.tinymediamanager.plugins/scraper-anidb

trackConnections();
 doc = Jsoup.parse(cachedUrl.getInputStream(), "UTF-8", "", Parser.xmlParser());
if (doc == null || doc.children().size() == 0) {
 return md;
Element anime = doc.child(0);
for (Element e : anime.children()) {
 if ("startdate".equalsIgnoreCase(e.tagName())) {
  try {
   Date date = StrgUtils.parseDate(e.text());
   md.setReleaseDate(date);

代码示例来源:origin: abc9070410/JComicDownloader

org.jsoup.nodes.Document doc = org.jsoup.Jsoup.connect(urlString.replaceFirst("[.]com[/]manhua-", ".com/rss-")).cookie("Cookie", "isAdult=1").parser(org.jsoup.parser.Parser.xmlParser()).get();
this.title = Common.getStringRemovedIllegalChar(NewEncoding.StoT(doc.getElementsByTag("title").get(0).text()));
for  (org.jsoup.nodes.Element e : doc.getElementsByTag("item")){
  volumeList.add( getVolumeWithFormatNumber( Common.getStringRemovedIllegalChar(
      NewEncoding.StoT(e.getElementsByTag("title").get(0).text().trim()))));
  urlList.add( e.getElementsByTag("link").get(0).text());

代码示例来源:origin: de.unistuttgart.ims/uimautil

public JCas read(JCas jcas, InputStream xmlStream) throws IOException {
  doc = Jsoup.parse(xmlStream, "UTF-8", "", Parser.xmlParser());
    root = doc;
  else
    root = doc.select(textRootSelector).first();
  root.traverse(vis);
  parsingDescription.setEncoding(doc.charset().name());
  Node rootNode = doc.root();
  List<String> declarations = new LinkedList<String>();
  for (Node topNode : rootNode.childNodes()) {

代码示例来源:origin: org.apache.any23/apache-any23-core

if (length >= 20 && bytes[length - 2] == '?') {
                String decl = "<" + new String(bytes, 2, length - 4) + ">";
                org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
                for (org.jsoup.nodes.Element el : doc.children()) {
                  if ("xml".equalsIgnoreCase(el.tagName())) {
                    String enc = el.attr("encoding");
                    if (enc != null && !enc.isEmpty()) {
                      encoding = enc;
return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());

相关文章