本文整理了Java中org.jsoup.parser.Parser类的一些代码示例,展示了Parser类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Parser类的具体详情如下:
包路径:org.jsoup.parser.Parser
类名称:Parser
[英]Parses HTML into a org.jsoup.nodes.Document. Generally best to use one of the more convenient parse methods in org.jsoup.Jsoup.
[中]将HTML解析为org.jsoup.nodes.Document。通常,最好使用org.jsoup.Jsoup中更方便的解析方法之一。
代码示例来源:origin: deeplearning4j/dl4j-examples
Document document = Jsoup.parse(str, "", Parser.xmlParser());
String descr;
Elements patent = document.select("us-patent-grant");
if (patent.size() > 0) {
Elements mainClassification = e2.select("main-classification");
if (mainClassification == null || mainClassification.size() == 0) {
log.warn("Skipping patent {} in document - no main classification");
return null;
String main = e2.select("main-classification").outerHtml().replaceAll("\n", "")
.replaceAll("<main-classification>", "").replaceAll("</main-classification>", "")
.replaceFirst(" ", ""); //Replace first space - not significant, always present. But SECOND space is important
descr = patent.select("description").text();
} else {
patent = document.select("PATDOC");
if (patent.size() > 0) {
title = patent.select("B540").first().text();
abstr = patent.select("SDOAB").text();
claims = patent.select("SDOCL").text();
代码示例来源:origin: org.jsoup/jsoup
/**
 * Loads a file to a Document, parsing it with the HTML parser.
 * <p>
 * The file stream opened here is always closed before this method returns,
 * whether parsing succeeds or throws.
 *
 * @param in file to load
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    // try-with-resources releases the file handle even when parsing fails part-way;
    // closing a FileInputStream more than once is harmless if the callee also closes it.
    try (FileInputStream fileStream = new FileInputStream(in)) {
        return parseInputStream(fileStream, charsetName, baseUri, Parser.htmlParser());
    }
}
代码示例来源:origin: org.jsoup/jsoup
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException {
if (input == null) // empty body
return new Document(baseUri);
input = ConstrainableInputStream.wrap(input, bufferSize, 0);
doc = parser.parseInput(docData, baseUri);
Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
if (meta.hasAttr("http-equiv"))
foundCharset = getCharsetFromContentType(meta.attr("content"));
if (foundCharset == null && meta.hasAttr("charset"))
foundCharset = meta.attr("charset");
if (foundCharset != null)
if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
if (prolog.name().equals("xml"))
reader.skip(1);
try {
doc = parser.parseInput(reader, baseUri);
} catch (UncheckedIOException e) {
代码示例来源:origin: USPTO/PatentPublicData
rawText = rawText.replaceAll("", "</q>");
Document jsoupDoc = Jsoup.parse("<body>" + rawText + "</body>", "", Parser.xmlParser());
jsoupDoc.outputSettings().prettyPrint(false).syntax(Syntax.xml).charset(StandardCharsets.UTF_16);
jsoupDoc.select("bold").tagName("b");
Elements figRefEls = jsoupDoc.select("FGREF");
for (int i = 1; i <= figRefEls.size(); i++) {
Element element = figRefEls.get(i - 1);
element.attr("id", "FR-" + Strings.padStart(String.valueOf(i), 4, '0'));
element.attr("idref", ReferenceTagger.createFigId(element.select("PDAT").text()));
element.tagName("a");
element.addClass("figref");
jsoupDoc = Jsoup.parse("<body>" + fieldTextCleaned + "</body>", "", Parser.xmlParser());
jsoupDoc.outputSettings().prettyPrint(false).syntax(OutputSettings.Syntax.xml).charset(StandardCharsets.UTF_16);
代码示例来源:origin: opacapp/opacclient
static List<LentItem> parse_medialist(Document doc) {
List<LentItem> media = new ArrayList<>();
Elements copytrs = doc.select(".data tr");
LentItem item = new LentItem();
if (tr.text().contains("keine Daten")) {
return null;
item.setTitle(tr.select(".account-display-title").select("b, strong")
.text().trim());
try {
item.setRenewable(false);
if (tr.select("a").size() > 0) {
for (Element link : tr.select("a")) {
String href = link.attr("abs:href");
if (lines.length == 4 || lines.length == 5) {
item.setAuthor(Jsoup.parse(lines[1]).text().trim());
item.setBarcode(Jsoup.parse(lines[2]).text().trim());
if (lines.length == 5) {
item.setBarcode(Parser.unescapeEntities(lines[1].trim(), false));
item.setStatus(Parser.unescapeEntities(lines[2].trim(), false));
} else if (lines.length == 2) {
item.setAuthor(Parser.unescapeEntities(lines[1].trim(), false));
代码示例来源:origin: USPTO/PatentPublicData
Document jsoupDoc = Jsoup.parse("<body>" + rawText + "</body>", "", Parser.xmlParser());
jsoupDoc.outputSettings().prettyPrint(false).charset(StandardCharsets.UTF_16);
Elements figEls = jsoupDoc.select("a.figref");
for (int i = 1; i <= figEls.size(); i++) {
Element element = figEls.get(i - 1);
element.attr("id", "FR-" + Strings.padStart(String.valueOf(i), 4, '0'));
Elements headerEls = jsoupDoc.select("PAC");
for (int i = 1; i <= headerEls.size(); i++) {
Element element = headerEls.get(i - 1);
element.attr("id", "H-" + Strings.padStart(String.valueOf(i), 4, '0'));
element.tagName("h2");
代码示例来源:origin: samczsun/Skype4J
/**
 * Handles a resource describing received files: resolves the chat and the
 * sending participant, parses the XML content for {@code file} elements and
 * dispatches a {@link FileReceivedEvent} carrying them.
 *
 * @param skype the client whose event dispatcher receives the event
 * @param resource JSON resource read for "content", "conversationLink" and the author
 */
@Override
public void handle(SkypeImpl skype, JsonObject resource) throws ConnectionException, ChatNotFoundException, IOException {
    // Read every raw field first, then validate, in the same order as before.
    String xmlContent = Utils.getString(resource, "content");
    String conversationLink = Utils.getString(resource, "conversationLink");
    String rawAuthor = getAuthor(resource);
    Validate.notNull(xmlContent, "Null content");
    Validate.notNull(conversationLink, "Null chat");
    Validate.notNull(rawAuthor, "Null author");

    String senderName = getUsername(rawAuthor);
    Validate.notNull(senderName, "Null username");

    Chat conversation = getChat(conversationLink, skype);
    Validate.notNull(conversation, "Null chatobj");

    Participant sender = conversation.getParticipant(senderName);
    Validate.notNull(sender, "Null initiator");

    // Each <file> element carries its name as text and numeric "size"/"tid" attributes.
    Document parsed = Parser.xmlParser().parseInput(xmlContent, "");
    List<ReceivedFile> files = parsed.getElementsByTag("file").stream().map(fileElement -> {
        String fileName = fileElement.text();
        long size = Long.parseLong(fileElement.attr("size"));
        long tid = Long.parseLong(fileElement.attr("tid"));
        return new ReceivedFileImpl(fileName, size, tid);
    }).collect(Collectors.toList());

    skype.getEventDispatcher().callEvent(new FileReceivedEvent(conversation, sender, files));
}
},
代码示例来源:origin: DigitalPebble/storm-crawler
.decode(ByteBuffer.wrap(content)).toString();
jsoupDoc = Parser.htmlParser().parseInput(html, url);
.selectFirst("meta[name~=(?i)robots][content]");
if (robotelement != null) {
robotsTags.extractMetaTags(robotelement.attr("content"));
slinks = new HashMap<>(0);
} else {
Elements links = jsoupDoc.select("a[href]");
slinks = new HashMap<>(links.size());
for (Element link : links) {
String targetURL = link.attr("abs:href");
.attr("rel"));
Element body = jsoupDoc.body();
if (body != null) {
text = textExtractor.text(body);
代码示例来源:origin: de.unistuttgart.ims/de.unistuttgart.ims.drama.io.core
public static void getNext(JCas jcas, InputStream file, Drama drama, boolean strict)
throws IOException, CollectionException {
Document doc = Jsoup.parse(file, "UTF-8", "", Parser.xmlParser());
drama.setDocumentTitle(doc.select("titleStmt > title").first().text());
if (!doc.select("idno[type=\"TextGridUri\"]").isEmpty())
drama.setDocumentId(doc.select("idno[type=\"TextGridUri\"]").first().text().substring(9));
Element authorElement = authorElements.get(i);
Author author = new Author(jcas);
author.setName(authorElement.text());
if (authorElement.hasAttr("key")) {
author.setPnd(authorElement.attr("key").replace("pnd:", "http://d-nb.info/gnd/"));
代码示例来源:origin: USPTO/PatentPublicData
/**
 * Converts raw XML text to plain text.
 * <p>
 * Every {@code PARA} element is first prefixed with padding derived from its
 * {@code LVL} attribute ({@code level + 1} repetitions of the pad string; a
 * missing or blank attribute counts as level 0). The document is then passed
 * through {@code getSimpleHtml}, re-parsed, and rendered by
 * {@link HtmlToPlainText}.
 *
 * @param rawText the raw XML fragment to convert
 * @param textConfig configuration handed to the plain-text converter
 * @return the plain-text rendering
 * @throws NumberFormatException if a non-blank {@code LVL} value is not an integer
 */
@Override
public String getPlainText(String rawText, FreetextConfig textConfig) {
    Document jsoupDoc = Jsoup.parse(rawText, "", Parser.xmlParser());
    for (Element paragraph : jsoupDoc.select("PARA")) {
        // jsoup's attr() returns "" (never null) for a missing attribute, so a
        // null check can never select the default; test for emptiness instead.
        String levelAttr = paragraph.attr("LVL").trim();
        int level = levelAttr.isEmpty() ? 0 : Integer.parseInt(levelAttr);
        StringBuilder stb = new StringBuilder();
        for (int i = 0; i <= level; i++) {
            stb.append(" ");
        }
        paragraph.prepend(stb.toString());
    }
    String simpleHtml = getSimpleHtml(jsoupDoc.outerHtml());
    Document simpleDoc = Jsoup.parse(simpleHtml, "", Parser.xmlParser());
    HtmlToPlainText htmlConvert = new HtmlToPlainText(textConfig);
    return htmlConvert.getPlainText(simpleDoc);
}
代码示例来源:origin: starlightknight/swagger-confluence
private static String reformatXHtml(final String inputXhtml, final Map<String, ConfluenceLink> confluenceLinkMap) {
final Document document = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser());
document.outputSettings().prettyPrint(false);
document.outputSettings().escapeMode(xhtml);
document.outputSettings().charset("UTF-8");
final Elements linkElements = document.select("a");
final String originalHref = linkElement.attr("href");
final ConfluenceLink confluenceLink = confluenceLinkMap.get(originalHref);
linkElement.before(confluenceLinkMarkup);
linkElement.html("");
linkElement.unwrap();
代码示例来源:origin: USPTO/PatentPublicData
/**
 * Splits raw text into paragraphs: the text is run through
 * {@code getSimpleHtml}, parsed with the XML parser, and the inner HTML of
 * each {@code p} element is collected in document order.
 *
 * @param rawText the raw text to split
 * @return inner HTML of every {@code p} element found
 */
@Override
public List<String> getParagraphText(String rawText) {
    Document parsed = Jsoup.parse(getSimpleHtml(rawText), "", Parser.xmlParser());
    List<String> result = new ArrayList<>();
    for (Element paragraph : parsed.select("p")) {
        String innerHtml = paragraph.html();
        result.add(innerHtml);
    }
    return result;
}
}
代码示例来源:origin: DigitalPebble/storm-crawler
/**
 * Looks for a META tag in the HTML that declares the character set of the
 * document, e.g. {@code <meta http-equiv="Content-Type"
 * content="text/html;charset=gb2312">} or HTML5 {@code <meta charset="gb2312">}.
 *
 * @param buffer raw bytes of the document
 * @param maxlength when positive and smaller than the buffer, only this many
 *        leading bytes are inspected
 * @return the first charset declared by a matching meta element, or
 *         {@code null} when none declares one
 */
private static String getCharsetFromMeta(byte[] buffer, int maxlength) {
    // Decode the inspected prefix with the default charset -- hopefully this
    // does not mangle the characters we are interested in.
    final int limit = (maxlength > 0 && maxlength < buffer.length) ? maxlength : buffer.length;
    final String markup = new String(buffer, 0, limit, DEFAULT_CHARSET);
    final Document parsed = Parser.htmlParser().parseInput(markup, "dummy");

    for (Element candidate : parsed.select("meta[http-equiv=content-type], meta[charset]")) {
        String charset = null;
        if (candidate.hasAttr("http-equiv")) {
            charset = getCharsetFromContentType(candidate.attr("content"));
        }
        // Fall back to the HTML5 attribute when the content-type gave nothing.
        if (charset == null && candidate.hasAttr("charset")) {
            charset = candidate.attr("charset");
        }
        if (charset != null) {
            return charset;
        }
    }
    return null;
}
代码示例来源:origin: crazyhitty/Munch
/**
 * Loads the OPML document, from {@code mUrl} (fetched with the XML parser)
 * when it is set, otherwise from {@code mFile} read as UTF-8, and stores its
 * {@code outline} elements in {@code mOpmlItems}.
 *
 * @param strings unused
 * @return {@code "success"}, or the exception message when loading fails
 */
@Override
protected String doInBackground(String... strings) {
    final Document loaded;
    try {
        loaded = (mUrl == null)
                ? Jsoup.parse(mFile, "UTF-8")
                : Jsoup.connect(mUrl).parser(Parser.xmlParser()).get();
    } catch (IOException e) {
        e.printStackTrace();
        return e.getMessage();
    }

    if (loaded != null) {
        mOpmlItems = loaded.select("outline");
    }
    return "success";
}
代码示例来源:origin: TeamNewPipe/NewPipeExtractor
/**
 * Reads an OPML document from the stream and turns each
 * {@code outline[type=rss]} entry into a {@link SubscriptionItem}.
 *
 * @param contentInputStream stream holding the OPML content
 * @return one subscription per rss outline, in document order
 * @throws ExtractionException (as {@code InvalidSourceException}) when the
 *         document has no {@code opml} or {@code outline} element, or when an
 *         entry has an empty title/URL or a URL that does not match
 *         {@code ID_PATTERN}
 */
private List<SubscriptionItem> getItemsFromOPML(InputStream contentInputStream) throws ExtractionException {
    final List<SubscriptionItem> subscriptions = new ArrayList<>();

    final String opmlText = readFromInputStream(contentInputStream);
    final Document opml = Jsoup.parse(opmlText, "", org.jsoup.parser.Parser.xmlParser());

    // Structural sanity checks before looking at individual entries.
    if (opml.select("opml").isEmpty()) {
        throw new InvalidSourceException("document does not have OPML tag");
    }
    if (opml.select("outline").isEmpty()) {
        throw new InvalidSourceException("document does not have at least one outline tag");
    }

    for (Element feed : opml.select("outline[type=rss]")) {
        final String feedTitle = feed.attr("title");
        final String feedUrl = feed.attr("abs:xmlUrl");

        final boolean incomplete = feedTitle.isEmpty() || feedUrl.isEmpty();
        if (incomplete) {
            throw new InvalidSourceException("document has invalid entries");
        }

        try {
            final String channelId = Parser.matchGroup1(ID_PATTERN, feedUrl);
            final String channelUrl = BASE_CHANNEL_URL + channelId;
            subscriptions.add(new SubscriptionItem(service.getServiceId(), channelUrl, feedTitle));
        } catch (Parser.RegexException e) {
            throw new InvalidSourceException("document has invalid entries", e);
        }
    }

    return subscriptions;
}
代码示例来源:origin: addthis/hydra
Parser parser = Parser.htmlParser().setTrackErrors(0);
@Nonnull Document doc = parser.parseInput(html, "");
@Nonnull Elements tags = doc.select(tagName);
@Nonnull String attrValue = tag.attr(tagAttr).toLowerCase();
for (String matchValue : values) {
if (attrValue.contains(matchValue)) {
代码示例来源:origin: org.tinymediamanager.plugins/scraper-anidb
trackConnections();
doc = Jsoup.parse(cachedUrl.getInputStream(), "UTF-8", "", Parser.xmlParser());
if (doc == null || doc.children().size() == 0) {
return md;
Element anime = doc.child(0);
for (Element e : anime.children()) {
if ("startdate".equalsIgnoreCase(e.tagName())) {
try {
Date date = StrgUtils.parseDate(e.text());
md.setReleaseDate(date);
代码示例来源:origin: abc9070410/JComicDownloader
org.jsoup.nodes.Document doc = org.jsoup.Jsoup.connect(urlString.replaceFirst("[.]com[/]manhua-", ".com/rss-")).cookie("Cookie", "isAdult=1").parser(org.jsoup.parser.Parser.xmlParser()).get();
this.title = Common.getStringRemovedIllegalChar(NewEncoding.StoT(doc.getElementsByTag("title").get(0).text()));
for (org.jsoup.nodes.Element e : doc.getElementsByTag("item")){
volumeList.add( getVolumeWithFormatNumber( Common.getStringRemovedIllegalChar(
NewEncoding.StoT(e.getElementsByTag("title").get(0).text().trim()))));
urlList.add( e.getElementsByTag("link").get(0).text());
代码示例来源:origin: de.unistuttgart.ims/uimautil
public JCas read(JCas jcas, InputStream xmlStream) throws IOException {
doc = Jsoup.parse(xmlStream, "UTF-8", "", Parser.xmlParser());
root = doc;
else
root = doc.select(textRootSelector).first();
root.traverse(vis);
parsingDescription.setEncoding(doc.charset().name());
Node rootNode = doc.root();
List<String> declarations = new LinkedList<String>();
for (Node topNode : rootNode.childNodes()) {
代码示例来源:origin: org.apache.any23/apache-any23-core
if (length >= 20 && bytes[length - 2] == '?') {
String decl = "<" + new String(bytes, 2, length - 4) + ">";
org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
for (org.jsoup.nodes.Element el : doc.children()) {
if ("xml".equalsIgnoreCase(el.tagName())) {
String enc = el.attr("encoding");
if (enc != null && !enc.isEmpty()) {
encoding = enc;
return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
内容来源于网络,如有侵权,请联系作者删除!