org.jsoup.Jsoup类的使用及代码示例

x33g5p2x  于2022-01-21 转载在 其他  
字(13.3k)|赞(0)|评价(0)|浏览(278)

本文整理了Java中org.jsoup.Jsoup类的一些代码示例,展示了Jsoup类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Jsoup类的具体详情如下:
包路径:org.jsoup.Jsoup
类名称:Jsoup

Jsoup介绍

[英]The core public access point to the jsoup functionality.
[中]jsoup功能的核心公共访问入口。

代码示例

代码示例来源:origin: loklak/loklak_server

/**
 * Extracts HTML that is embedded as a JSON string value inside the {@code <script>}
 * elements of a page.
 *
 * <p>Every backslash is removed from {@code raw_html} before it is parsed. For each script
 * element whose content contains the marker {@code "html":"}, the text between the end of
 * the first marker and the last three characters of the script content is appended to the
 * result. (The three trailing characters are presumably the closing quote/brace sequence of
 * the surrounding JSON — confirm against the pages being scraped.)
 *
 * @param raw_html raw page markup, possibly containing escaped quotes
 * @return the concatenated fragments, or an empty string when no script contains the marker
 */
private static String getNestedHtml(String raw_html){
  Document doc = Jsoup.parse(raw_html.replace("\\",""));
  Elements scripts = doc.getElementsByTag("script");
  // marker that precedes the embedded html value
  Pattern marker = Pattern.compile("\"html\":\"");
  StringBuilder nested = new StringBuilder();
  for (Element script : scripts) {
    String content = script.html();
    Matcher m = marker.matcher(content);
    if (!m.find()) {
      continue;
    }
    int end = content.length() - 3;
    // Guard: when the marker ends inside the last three characters there is nothing to
    // extract, and substring(m.end(), end) would throw StringIndexOutOfBoundsException.
    if (end >= m.end()) {
      nested.append(content, m.end(), end);
    }
  }
  return nested.toString();
}

代码示例来源:origin: ChinaSilence/any-video

/**
 * Fetches the document at {@code openIdUri + accessToken} and extracts the value of the
 * {@code "openid"} entry from its text.
 *
 * @param accessToken token appended to the open-id endpoint URI
 * @return the captured openid, or {@code null} when the response text contains no
 *         {@code "openid":"..."} pair
 * @throws IOException if the HTTP GET fails
 */
private String getOpenId(String accessToken) throws IOException{
  String responseText = Jsoup.connect(openIdUri + accessToken).get().text();
  Matcher openIdMatcher = Pattern.compile("\"openid\":\"(.*?)\"").matcher(responseText);
  return openIdMatcher.find() ? openIdMatcher.group(1) : null;
}

代码示例来源:origin: JpressProjects/jpress

/**
 * Returns the {@code src} attribute of the first {@code <img>} element in an HTML fragment.
 *
 * @param html HTML body fragment to inspect
 * @return the {@code src} value, or {@code null} when {@code html} is blank, contains no
 *         {@code <img>}, or the first image's {@code src} is blank
 */
public static String getFirstImageSrc(String html) {
  if (StrUtils.isBlank(html)) {
    return null;
  }
  Elements images = Jsoup.parseBodyFragment(html).select("img");
  if (images == null || images.isEmpty()) {
    return null;
  }
  String firstSrc = images.first().attr("src");
  if (StrUtils.isBlank(firstSrc)) {
    return null;
  }
  return firstSrc;
}

代码示例来源:origin: RipMeApp/ripme

// NOTE: abridged excerpt — intermediate lines and closing braces are elided, so this
// fragment does not compile as shown.
// Branch 1: imgur album URLs (host ends with imgur.com and the URL contains "imgur.com/a/").
if ((url.getHost().endsWith("imgur.com"))
    && url.toExternalForm().contains("imgur.com/a/")) {
  try {
    logger.debug("Fetching imgur album at " + url);
// Branch 2: imgur URLs containing a comma (presumably a comma-separated id list — confirm).
else if (url.getHost().endsWith("imgur.com") && url.toExternalForm().contains(",")) {
// i.reddituploads.com links: group 1 captures the alphanumeric id before the query string.
Pattern p = Pattern.compile("https?://i.reddituploads.com/([a-zA-Z0-9]+)\\?.*");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
  logger.info("URL: " + url.toExternalForm());
  // NOTE(review): replacing "&" with "&" is a no-op as written; the first argument was
  // presumably "&amp;" before HTML extraction — confirm against the upstream project.
  String u = url.toExternalForm().replaceAll("&", "&");
  try {
    // Fetch the page and scan its <meta> tags for a direct media URL.
    Document doc = Jsoup.connect(url.toExternalForm())
        .userAgent(AbstractRipper.USER_AGENT)
        .get();
    for (Element el : doc.select("meta")) {
      // The first og:video meta tag wins; its content attribute is returned as the only result.
      if (el.attr("property").equals("og:video")) {
        result.add(new URL(el.attr("content")));
        return result;
      // Otherwise a twitter:image:src meta tag is used the same way.
      else if (el.attr("name").equals("twitter:image:src")) {
        result.add(new URL(el.attr("content")));
        return result;

代码示例来源:origin: RipMeApp/ripme

// NOTE: abridged excerpt — intermediate lines and closing braces are elided, so this
// fragment does not compile as shown.
Pattern p;
Matcher m;
p = Pattern.compile(IMAGE_PATTERN);
// Matches a trailing "_<digits>.<ext>" suffix; group 1 captures the image extension.
Pattern qualP = Pattern.compile("_[0-9]+\\.(jpg|png|gif|bmp)$");
Matcher qualM;
        // Download only when the whole file URL matches IMAGE_PATTERN.
        m = p.matcher(fileURL.toString());
        if (m.matches()) {
          downloadURL(fileURL, date);
        } else {
      // Video posts: rewrite every "http:" in the video URL to "https:" before downloading.
      fileURL = new URL(post.getString("video_url").replaceAll("http:", "https:"));
      downloadURL(fileURL, date);
    } catch (Exception e) {
    // The post's "body" field is parsed as HTML to look for an <img> with a non-empty src.
    Document d = Jsoup.parse(post.getString("body"));
    if (!d.select("img").attr("src").isEmpty()) {
      try {
        String imgSrc = d.select("img").attr("src");
        qualM = qualP.matcher(imgSrc);
        // Replace the numeric suffix with "_1280", keeping the original extension ($1).
        // NOTE(review): the "." in the replacement is a literal dot, which is intended here.
        imgSrc = qualM.replaceFirst("_1280.$1");
        downloadURL(new URL(imgSrc), date);
      } catch (MalformedURLException e) {

代码示例来源:origin: loklak/loklak_server

// NOTE: abridged excerpt — intermediate lines and closing braces are elided, so this
// fragment does not compile as shown.
Post githubProfile = new GithubPost(profile, 0);
try {
  html = Jsoup.parse(bufferedReaderToString(br));
} catch (IOException e) {
  DAO.trace(e);
// Read the src attribute from elements whose class attribute contains "avatar".
String avatarUrl = html.getElementsByAttributeValueContaining("class", "avatar").attr("src");
// Group 1 captures the numeric id in ".../u/<id>?".
// NOTE(review): the leading "." is unescaped and therefore matches any character.
Pattern avatarUrlToUserId = Pattern.compile(".com\\/u\\/([0-9]+)\\?");
Matcher m = avatarUrlToUserId.matcher(avatarUrl);
// NOTE(review): the result of find() is ignored; group(1) below throws
// IllegalStateException when the avatar URL does not match.
m.find();
userId = m.group(1);
githubProfile.put("user_id", userId);
githubProfile.put("post_type", "user");
githubProfile.put("avatar_url", "https://avatars0.githubusercontent.com/u/" + userId);
String email = html.getElementsByAttributeValueContaining("itemprop", "email").text();
// Anything without an "@" is discarded as not being an e-mail address.
if (!email.contains("@")) {
  email = "";
String specialLink = html.getElementsByAttributeValueContaining("itemprop", "url").text();
githubProfile.put("special_link", specialLink);
Elements joiningDates = html.getElementsByAttributeValueContaining("class", "dropdown-item");
for (Element joiningDate: joiningDates) {
  String joinDate = joiningDate.attr("href");
  if (joinDate.contains("join")) {
    // Keep the last 10 characters of the href (presumably a yyyy-MM-dd date — confirm).
    joinDate = joinDate.substring(joinDate.length() - 10);

代码示例来源:origin: TEAMMATES/teammates

/**
 * Compares the timezone database version shown on the currently loaded page with the
 * version published on the IANA release page. When they differ, the test fails if the
 * published release is {@code DAYS_TO_UPDATE_TZ} days old or more.
 */
@Test
public void testTimezoneDatabasesAreUpToDate() {
  // Version reported by the page that is currently open in the browser.
  Document appPage = Jsoup.parse(browser.driver.getPageSource());
  String bundledVersion = appPage.getElementById("tzversion-moment").text();

  // Version published on the IANA release page.
  browser.driver.get(IANA_TIMEZONE_DATABASE_URL);
  Document ianaPage = Jsoup.parse(browser.driver.getPageSource());
  String publishedVersion = ianaPage.getElementById("version").text();

  if (bundledVersion.equals(publishedVersion)) {
    return;
  }

  // Versions differ: read the release day and check the grace period.
  String releaseText = ianaPage.getElementById("date").text();
  Matcher releaseMatcher = Pattern.compile("\\(Released (.+)\\)").matcher(releaseText);
  assertTrue(releaseMatcher.find());

  DateTimeFormatter isoDay = DateTimeFormatter.ofPattern("yyyy-MM-dd");
  LocalDate releasedOn = LocalDate.parse(releaseMatcher.group(1), isoDay);
  LocalDate today = Instant.now().atZone(Const.DEFAULT_TIME_ZONE).toLocalDate();
  LocalDate updateDeadline = releasedOn.plusDays(DAYS_TO_UPDATE_TZ);
  assertTrue(
      "The timezone database version is not up-to-date for more than " + DAYS_TO_UPDATE_TZ + " days,"
          + " please update them according to the maintenance guide.",
      updateDeadline.isAfter(today));
}

代码示例来源:origin: huxq17/SwipeCardsView

/**
 * Parses {@code html}, takes the 11th {@code <span>} element (index 10) and returns the
 * integer formed by concatenating every digit found in that element's outer HTML.
 *
 * @param html page markup to parse
 * @return the integer built from the digits of the selected span
 * @throws IndexOutOfBoundsException if the page has fewer than 11 {@code <span>} elements
 * @throws NumberFormatException if the span contains no digits or the digits overflow an int
 */
private int getCount(String html) {
  Document doc = Jsoup.parse(html);
  Element page = doc.select("span").get(10);
  // Match digits only. The previous character class "[\\d*]" also matched a literal '*',
  // which would then make Integer.parseInt fail.
  Matcher m = Pattern.compile("\\d").matcher(page.toString());
  StringBuilder digits = new StringBuilder();
  while (m.find()) {
    digits.append(m.group());
  }
  return Integer.parseInt(digits.toString());
}

代码示例来源:origin: jeremylong/DependencyCheck

// NOTE: abridged excerpt — the closing brace of the else branch is elided.
// When the description matches HTML_DETECTION_PATTERN, parse it and keep only its text
// content; otherwise use the description unchanged.
if (HTML_DETECTION_PATTERN.matcher(description).find()) {
  desc = Jsoup.parse(description).text();
} else {
  desc = description;

代码示例来源:origin: iMeiji/Toutiao

// NOTE: abridged excerpt — the rest of the method body and its closing braces are elided.
// Scans the <script> elements of the given HTML for the one that mentions
// "BASE_DATA.galleryInfo" and looks for a JSON.parse("...") call inside it.
private Boolean parseHTML(String HTML) {
  boolean flag = false;
  Document doc = Jsoup.parse(HTML);
  Elements scripts = doc.getElementsByTag("script");
  for (Element e : scripts) {
    String script = e.toString();
    if (script.contains("BASE_DATA.galleryInfo")) {
      // Switch from the element's outer HTML to the string form of its first child node.
      script = e.childNode(0).toString();
      // Single capturing group: from "JSON.parse(" followed by a quote up to the last ")".
      Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(script);
      while (matcher.find()) {
        // groupCount() is a property of the pattern (1 here), not of the current match.
        int count = matcher.groupCount();
        if (count >= 1) {
          // NOTE(review): index of the first "(" in the whole script, not in the match — confirm intent.
          int start = script.indexOf("(");

代码示例来源:origin: RipMeApp/ripme

// NOTE: abridged excerpt — intermediate lines and closing braces are elided, so this
// fragment does not compile as shown.
// Loads the "/noscript" variant of an imgur URL and adds one ImgurImage per "div.image" element.
public static ImgurAlbum getImgurAlbum(URL url) throws IOException {
  String strUrl = url.toExternalForm();
  // URLs without a comma get "/all" appended.
  // NOTE(review): strUrl is not read again in the visible lines.
  if (!strUrl.contains(",")) {
    strUrl += "/all";
  String newUrl = url.toExternalForm() + "/noscript";
  LOGGER.info("    Retrieving " + newUrl);
  doc = Jsoup.connect(newUrl)
            .userAgent(USER_AGENT)
            .get();
  for (Element thumb : doc.select("div.image")) {
    String image;
    // Prefer the link target when a zoom anchor exists, else the <img> source.
    // Both values get an "http:" prefix (presumably they are protocol-relative — confirm).
    if (!thumb.select("a.zoom").isEmpty()) {
      image = "http:" + thumb.select("a").attr("href");
    } else if (!thumb.select("img").isEmpty()) {
      image = "http:" + thumb.select("img").attr("src");
    } else {
      // NOTE(review): lines are elided here; as shown, `image` would be unassigned in this
      // branch. The .gif -> .mp4 rewrite presumably belongs to a later statement — confirm upstream.
      image = image.replace(".gif", ".mp4");
    ImgurImage imgurImage = new ImgurImage(new URL(image));
    imgurAlbum.addImage(imgurImage);

代码示例来源:origin: mygithuball/any-video

/**
 * Checks whether the page at {@code url} contains markup matching the friend-link pattern
 * built from {@code FRIEND_LINK_HTML_REG}, {@code appDomain} and {@code appName}.
 *
 * @param url address of the page to inspect
 * @return {@code true} when the pattern is found in the page's HTML; {@code false} when it
 *         is not found or the page could not be fetched
 */
private boolean related(String url) {
  final Document fetched;
  try {
    fetched = Jsoup.connect(url).get();
  } catch (IOException e) {
    log.info("Add Friend Link Error, url:" + url);
    e.printStackTrace();
    return false;
  }
  String linkRegex = String.format(FRIEND_LINK_HTML_REG, appDomain, appName);
  String pageHtml = fetched.html();
  return Pattern.compile(linkRegex).matcher(pageHtml).find();
}

代码示例来源:origin: loklak/loklak_server

/**
 * Matches the {@code instaJsonData} pattern against the combined content of the page's
 * {@code <script>} elements and wraps the matched section in a {@link Post}.
 *
 * @param br reader supplying the HTML of the instagram page
 * @param url address of the page (not used by this method)
 * @return a JSONArray holding one Post built from the matched script section, or an empty
 *         JSONArray when the page cannot be read or the pattern does not match
 */
public JSONArray scrapeInstagram(BufferedReader br, String url) {
  JSONArray instaProfile = new JSONArray();
  Document htmlPage;
  try {
    htmlPage = Jsoup.parse(this.bufferedReaderToString(br));
  } catch (IOException e) {
    DAO.trace(e);
    // Nothing was parsed; continuing would dereference a null document.
    return instaProfile;
  }
  String script = htmlPage.getElementsByTag("script").html();
  Matcher m = instaJsonData.matcher(script);
  if (!m.find()) {
    // No embedded data found; calling m.start(1) here would throw IllegalStateException.
    return instaProfile;
  }
  // Cut from the start of group 1 up to and including the first character of group 2.
  int start = m.start(1);
  int end = m.start(2) + 1;
  script = script.substring(start, end);
  //TODO: pre-process the posts captured. At present, complete array of posts are output.
  //Only useful data shall be outputted.
  Post instaObj = new Post(script, this.query);
  instaProfile.put(instaObj);
  return instaProfile;
}

代码示例来源:origin: bonigarcia/webdrivermanager

// NOTE: abridged excerpt — the enclosing try/loop lines and closing braces are elided, so
// this fragment does not compile as shown.
String driverStr = driverUrl.toString();
String driverUrlContent = driverUrl.getPath();
  // Parse the stream with no explicit charset (null) and an empty base URI.
  org.jsoup.nodes.Document doc = Jsoup.parse(in, null, "");
  // Walk every anchor on the page.
  Iterator<org.jsoup.nodes.Element> iterator = doc.select("a")
      .iterator();
  List<URL> urlList = new ArrayList<>();
    String link = iterator.next().attr("href");
    // Links containing "mirror" that end with SLASH are expanded via getDriversFromMirror.
    // The driver URL's path is stripped from the link before it is appended to the driver URL.
    if (link.contains("mirror") && link.endsWith(SLASH)) {
      urlList.addAll(getDriversFromMirror(new URL(
          driverStr + link.replace(driverUrlContent, ""))));
    } else if (link.startsWith(driverUrlContent)

代码示例来源:origin: HubSpot/jinjava

/**
 * Renders a string value, strips its HTML markup and replaces every {@code WHITESPACE}
 * match with a single space. Non-string inputs are returned unchanged.
 *
 * @param object value to filter
 * @param interpreter interpreter used to render the string before stripping
 * @param arg filter arguments (not used)
 * @return the stripped and normalized text, or {@code object} itself when it is not a String
 */
@Override
public Object filter(Object object, JinjavaInterpreter interpreter, String... arg) {
 if (object instanceof String) {
  String rendered = interpreter.renderFlat((String) object);
  String textOnly = Jsoup.parseBodyFragment(rendered).text();
  return WHITESPACE.matcher(textOnly).replaceAll(" ");
 }
 return object;
}

代码示例来源:origin: magefree/mage

// NOTE: abridged excerpt — intermediate lines and closing braces are elided, so this
// fragment does not compile as shown.
// Without a proxy the page is fetched directly through Jsoup.
if (proxyType == ProxyType.NONE) {
  urlDocument = pageUrl;
  doc = Jsoup.connect(urlDocument).get();
} else {
  // With a proxy: open an HttpURLConnection through the HTTP proxy stored in the
  // preferences, then parse the accumulated text.
  String proxyServer = prefs.get("proxyAddress", "");
  int proxyPort = Integer.parseInt(prefs.get("proxyPort", "0"));
  URL url = new URL(pageUrl);
  Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyServer, proxyPort));
  HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
    tmp.append(line);
  doc = Jsoup.parse(String.valueOf(tmp));
Elements cardsImages = doc.select("img[src^=cards/]"); // starts with cards/
// Also collect images whose src starts with any of the configured alias prefixes.
if (!aliasesStart.isEmpty()) {
  for (String text : aliasesStart) {
    cardsImages.addAll(doc.select("img[src^=" + text + ']'));
  String cardLink = cardsImage.attr("src");
  String cardName = null;
  if (cardLink.startsWith("cards/") && cardLink.endsWith(".jpg")) {

代码示例来源:origin: magefree/mage

/**
 * Downloads and parses the HTML document at {@code urlString}, using the proxy settings
 * stored in the application preferences.
 *
 * <p>Without a proxy the page is fetched through Jsoup with a 60 second timeout. With a
 * proxy, the page is read through an {@link HttpURLConnection} (10 s connect timeout, 60 s
 * read timeout) and the collected text is parsed.
 *
 * @param urlString address of the page to download
 * @return the parsed document
 * @throws NumberFormatException if the stored proxy port is not a valid integer
 * @throws IOException if the connection or the read fails
 */
public static Document downloadHtmlDocument(String urlString) throws NumberFormatException, IOException {
    Preferences prefs = MageFrame.getPreferences();
    Connection.ProxyType proxyType = Connection.ProxyType.valueByText(prefs.get("proxyType", "None"));
    if (proxyType == ProxyType.NONE) {
        return Jsoup.connect(urlString).timeout(60 * 1000).get();
    }

    String proxyServer = prefs.get("proxyAddress", "");
    int proxyPort = Integer.parseInt(prefs.get("proxyPort", "0"));
    URL url = new URL(urlString);
    Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyServer, proxyPort));
    HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
    uc.setConnectTimeout(10000);
    uc.setReadTimeout(60000);
    uc.connect();

    StringBuilder tmp = new StringBuilder();
    // try-with-resources closes the reader and the underlying connection stream, which
    // were previously left open.
    // NOTE(review): the reader decodes with the platform default charset, and readLine()
    // drops line terminators, so lines are joined without separators. Both are kept as in
    // the original behavior.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream()))) {
        String line;
        while ((line = in.readLine()) != null) {
            tmp.append(line);
        }
    }
    return Jsoup.parse(tmp.toString());
}
}

代码示例来源:origin: asciidoctor/asciidoctorj

/**
 * POSTs the text "Hello World" to the {@code asciidoctor} path relative to {@code url} and
 * verifies that the response body starts with {@code <div class="paragraph">} wrapping a
 * {@code <p>} whose own text is "Hello World".
 */
@Test
public void test() throws Exception {
  HttpURLConnection conn = (HttpURLConnection) new URL(url, "asciidoctor").openConnection();
  conn.setDoOutput(true);
  conn.setRequestMethod("POST");
  // Close the request body stream once it is written instead of leaving it open.
  try (java.io.OutputStream out = conn.getOutputStream()) {
    out.write("Hello World".getBytes());
  }
  try (InputStream in = conn.getInputStream()) {
    final Document doc = Jsoup.parse(readFull(in));
    final Element first = doc.body().children().first();
    assertEquals("div", first.tagName());
    assertEquals("paragraph", first.className());
    final Element paragraph = first.children().first();
    assertEquals("p", paragraph.tagName());
    assertEquals("Hello World", paragraph.ownText());
  }
}

代码示例来源:origin: kriegaex/Galileo-Openbook-Cleaner

/**
 * Loads the openbook index page (10 second timeout) and collects the last path segment of
 * every {@code a.btn-primary} link target.
 *
 * @return the sorted set of link names, with everything up to and including the last
 *         {@code '/'} of each href removed
 * @throws Exception if the page cannot be fetched or parsed
 */
private static SortedSet<String> getWebSiteURLs() throws Exception {
  Document indexPage = Jsoup.parse(new URL("https://www.rheinwerk-verlag.de/openbook/"), 10000);
  SortedSet<String> linkNames = new TreeSet<>();
  for (Element anchor : indexPage.select("a.btn-primary")) {
    String href = anchor.attr("href");
    // The greedy ".*/" strips the path up to the final slash.
    linkNames.add(href.replaceAll(".*/", ""));
  }
  return linkNames;
}

代码示例来源:origin: decaywood/XueQiuSuperSpider

/**
 * Rebuilds {@code industryMap} from the page at {@code URLMapper.COMPREHENSIVE_PAGE}.
 *
 * <p>The links are taken from a fixed position in the page: the second element with class
 * {@code second-nav}, then its fourth child, then that child's fourth child. Every
 * {@code <a>} selected from there that has both a {@code title} and an {@code href}
 * becomes one entry, keyed by title.
 *
 * @throws Exception if the page cannot be requested, or (as IndexOutOfBoundsException) if
 *         the page does not have the expected structure
 */
private void initMap() throws Exception {
  industryMap = new HashMap<>();
  String content = request(new URL(URLMapper.COMPREHENSIVE_PAGE.toString()));
  Elements links = Jsoup.parse(content).getElementsByClass("second-nav")
      .get(1).children()
      .get(3).children()
      .get(3).children()
      .select("a");
  for (Element link : links) {
    if (!link.hasAttr("title") || !link.hasAttr("href")) {
      continue;
    }
    // Pass the href directly; the former StringBuilder round-trip only copied the string.
    String title = link.attr("title");
    industryMap.put(title, new Industry(title, link.attr("href")));
  }
}

相关文章