[英]The core public access point to the jsoup functionality.
代码示例来源:origin: loklak/loklak_server
* This is a helper function that helps user to extract html nested inside of html script
* @param raw_html
* @return nested html String
private static String getNestedHtml(String raw_html){
String html = raw_html.replace("\\","");
Document doc = Jsoup.parse(html);
//get the script tag
Elements scripts = doc.getElementsByTag("script");
//pattern for extracting html
Pattern pttrn = Pattern.compile("\"html\":\"");
String nested_html = "";
for (Element script:scripts){
Matcher m = pttrn.matcher(html = script.html());
nested_html += html.substring(m.end(), html.length() -3);
return nested_html;
代码示例来源:origin: ChinaSilence/any-video
private String getOpenId(String accessToken) throws IOException{
String url = openIdUri + accessToken;
Document document = Jsoup.connect(url).get();
String resultText = document.text();
Matcher matcher = Pattern.compile("\"openid\":\"(.*?)\"").matcher(resultText);
if (matcher.find()){
return matcher.group(1);
return null;
代码示例来源:origin: JpressProjects/jpress
public static String getFirstImageSrc(String html) {
if (StrUtils.isBlank(html))
return null;
Elements es = Jsoup.parseBodyFragment(html).select("img");
if (es != null && es.size() > 0) {
String src = es.first().attr("src");
return StrUtils.isBlank(src) ? null : src;
return null;
代码示例来源:origin: RipMeApp/ripme
// Excerpt from a larger URL-resolving routine. Closing braces, catch blocks and the tail of
// the Jsoup call chain are missing from this excerpt, so it does not compile on its own.
// Branch 1: imgur album links (host ends with imgur.com and the URL contains "imgur.com/a/").
if ((url.getHost().endsWith("imgur.com"))
&& url.toExternalForm().contains("imgur.com/a/")) {
try {
logger.debug("Fetching imgur album at " + url);
// Branch 2: imgur links whose URL contains a comma.
else if (url.getHost().endsWith("imgur.com") && url.toExternalForm().contains(",")) {
// Whole-URL match for i.reddituploads.com links with a query string; group 1 is the
// alphanumeric id. NOTE(review): the dots in the host are unescaped and match any character.
Pattern p = Pattern.compile("https?://i.reddituploads.com/([a-zA-Z0-9]+)\\?.*");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
logger.info("URL: " + url.toExternalForm());
// NOTE(review): as written this replaces "&" with "&" (a no-op); presumably the first
// argument was an HTML-escaped ampersand before this text was extracted — confirm.
String u = url.toExternalForm().replaceAll("&", "&");
try {
// NOTE(review): the call chain on the next line is cut off in this excerpt.
Document doc = Jsoup.connect(url.toExternalForm())
// Scan the page's <meta> tags and return as soon as one yields a media URL.
for (Element el : doc.select("meta")) {
if (el.attr("property").equals("og:video")) {
result.add(new URL(el.attr("content")));
return result;
else if (el.attr("name").equals("twitter:image:src")) {
result.add(new URL(el.attr("content")));
return result;
代码示例来源:origin: RipMeApp/ripme
// Excerpt from a larger post-handling routine; the enclosing try, several closing braces and
// the catch bodies are missing from this excerpt.
Pattern p;
Matcher m;
p = Pattern.compile(IMAGE_PATTERN);
// Matches a trailing "_<digits>.<ext>" suffix on an image file name; group 1 is the extension.
Pattern qualP = Pattern.compile("_[0-9]+\\.(jpg|png|gif|bmp)$");
Matcher qualM;
m = p.matcher(fileURL.toString());
// Download directly when the whole URL matches IMAGE_PATTERN.
if (m.matches()) {
downloadURL(fileURL, date);
} else {
// Otherwise use the post's "video_url", rewriting every "http:" to "https:".
fileURL = new URL(post.getString("video_url").replaceAll("http:", "https:"));
downloadURL(fileURL, date);
} catch (Exception e) {
// Fallback: parse the post "body" as HTML and look for an embedded <img>.
Document d = Jsoup.parse(post.getString("body"));
if (!d.select("img").attr("src").isEmpty()) {
try {
String imgSrc = d.select("img").attr("src");
qualM = qualP.matcher(imgSrc);
// Rewrite the numeric suffix to "_1280", keeping the original extension.
imgSrc = qualM.replaceFirst("_1280.$1");
downloadURL(new URL(imgSrc), date);
} catch (MalformedURLException e) {
代码示例来源:origin: loklak/loklak_server
// Excerpt from a GitHub profile scraper; declarations of html/userId/br/profile, the catch
// body and several closing braces are missing from this excerpt.
Post githubProfile = new GithubPost(profile, 0);
try {
html = Jsoup.parse(bufferedReaderToString(br));
} catch (IOException e) {
// src of an element whose class attribute contains "avatar".
String avatarUrl = html.getElementsByAttributeValueContaining("class", "avatar").attr("src");
// Group 1 captures the digits between ".com/u/" and "?" in the avatar URL.
// NOTE(review): the leading "." is unescaped and matches any character.
Pattern avatarUrlToUserId = Pattern.compile(".com\\/u\\/([0-9]+)\\?");
Matcher m = avatarUrlToUserId.matcher(avatarUrl);
// NOTE(review): no find()/matches() call is visible before group(1); unless one was elided
// from this excerpt, this throws IllegalStateException — confirm against the full source.
userId = m.group(1);
githubProfile.put("user_id", userId);
githubProfile.put("post_type", "user");
githubProfile.put("avatar_url", "https://avatars0.githubusercontent.com/u/" + userId);
String email = html.getElementsByAttributeValueContaining("itemprop", "email").text();
// Discard text that cannot be an address.
if (!email.contains("@")) {
email = "";
String specialLink = html.getElementsByAttributeValueContaining("itemprop", "url").text();
githubProfile.put("special_link", specialLink);
Elements joiningDates = html.getElementsByAttributeValueContaining("class", "dropdown-item");
for (Element joiningDate: joiningDates) {
String joinDate = joiningDate.attr("href");
if (joinDate.contains("join")) {
// Keep the last 10 characters of the href (presumably a yyyy-MM-dd date — TODO confirm).
joinDate = joinDate.substring(joinDate.length() - 10);
代码示例来源:origin: TEAMMATES/teammates
// Compares a timezone database version shown on one page with the version shown on a release
// page. Lines are missing from this excerpt (the surrounding assertion call, closing braces).
public void testTimezoneDatabasesAreUpToDate() {
// ensure the timezone databases are up-to-date
String currentTzVersion = Jsoup.parse(browser.driver.getPageSource()).getElementById("tzversion-moment").text();
// NOTE(review): the same page source is parsed again here; presumably a navigation to the
// release page happens in between and was elided — confirm.
Document tzReleasePage = Jsoup.parse(browser.driver.getPageSource());
String latestTzVersion = tzReleasePage.getElementById("version").text();
if (!currentTzVersion.equals(latestTzVersion)) {
// find the release day
String releaseDateString = tzReleasePage.getElementById("date").text();
// Group 1 captures the text inside "(Released ...)".
Pattern datePattern = Pattern.compile("\\(Released (.+)\\)");
Matcher matcher = datePattern.matcher(releaseDateString);
// NOTE(review): no find()/matches() call is visible before group(1); unless one was elided,
// this throws IllegalStateException — confirm against the full source.
LocalDate releaseDate = LocalDate.parse(matcher.group(1), DateTimeFormatter.ofPattern("yyyy-MM-dd"));
LocalDate nowDate = Instant.now().atZone(Const.DEFAULT_TIME_ZONE).toLocalDate();
// The next two lines are the message argument of a call whose opening is not in this excerpt.
"The timezone database version is not up-to-date for more than " + DAYS_TO_UPDATE_TZ + " days,"
+ " please update them according to the maintenance guide.",
代码示例来源:origin: huxq17/SwipeCardsView
private int getCount(String html) {
Document doc = Jsoup.parse(html);
Elements pages = doc.select("span");
Element page = pages.get(10);
Pattern p = Pattern.compile("[\\d*]");
Matcher m = p.matcher(page.toString());
StringBuffer stringBuffer = new StringBuffer();
while (m.find()) {
return Integer.parseInt(stringBuffer.toString());
代码示例来源:origin: jeremylong/DependencyCheck
if (HTML_DETECTION_PATTERN.matcher(description).find()) {
desc = Jsoup.parse(description).text();
} else {
desc = description;
代码示例来源:origin: iMeiji/Toutiao
// Scans the <script> elements of a page for the one mentioning "BASE_DATA.galleryInfo" and
// locates a JSON.parse("...") call inside it. The method is truncated in this excerpt: the
// remainder of the loop body, the return statement and all closing braces are missing.
private Boolean parseHTML(String HTML) {
boolean flag = false;
Document doc = Jsoup.parse(HTML);
Elements scripts = doc.getElementsByTag("script");
for (Element e : scripts) {
String script = e.toString();
if (script.contains("BASE_DATA.galleryInfo")) {
// Switch from the element's outer HTML to its first child node (the script text).
script = e.childNode(0).toString();
// Group 1 spans from JSON.parse(" up to the last ")" on the line (".+" is greedy).
Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(script);
while (matcher.find()) {
// groupCount() is the number of capturing groups in the pattern (1 here), not the number
// of matches, so the check below is always true.
int count = matcher.groupCount();
if (count >= 1) {
int start = script.indexOf("(");
代码示例来源:origin: RipMeApp/ripme
// Builds an ImgurAlbum by scraping an album's "/noscript" page. Heavily elided in this
// excerpt: the declaration of doc, the rest of the Jsoup call chain, the else-branch body,
// the album assembly, the return statement and closing braces are missing.
public static ImgurAlbum getImgurAlbum(URL url) throws IOException {
String strUrl = url.toExternalForm();
if (!strUrl.contains(",")) {
strUrl += "/all";
// NOTE(review): newUrl is derived from url, not from strUrl, in the visible lines — confirm
// whether strUrl is used elsewhere in the full source.
String newUrl = url.toExternalForm() + "/noscript";
LOGGER.info(" Retrieving " + newUrl);
// NOTE(review): the call chain on the next line is cut off in this excerpt.
doc = Jsoup.connect(newUrl)
// One ImgurImage per div.image thumbnail.
for (Element thumb : doc.select("div.image")) {
String image;
// Prefer the link target when an a.zoom element is present, else fall back to the img src.
// Both values get an "http:" prefix (presumably they are protocol-relative — TODO confirm).
if (!thumb.select("a.zoom").isEmpty()) {
image = "http:" + thumb.select("a").attr("href");
} else if (!thumb.select("img").isEmpty()) {
image = "http:" + thumb.select("img").attr("src");
} else {
// NOTE(review): the body of this else branch and its closing brace are missing here; the
// replace below presumably runs after the if/else chain — confirm.
image = image.replace(".gif", ".mp4");
ImgurImage imgurImage = new ImgurImage(new URL(image));
代码示例来源:origin: mygithuball/any-video
private boolean related(String url) {
try {
Document document = Jsoup.connect(url).get();
String reg = String.format(FRIEND_LINK_HTML_REG, appDomain, appName);
String html = document.html();
Matcher matcher = Pattern.compile(reg).matcher(html);
return matcher.find();
} catch (IOException e) {
log.info("Add Friend Link Error, url:" + url);
return false;
代码示例来源:origin: loklak/loklak_server
* Method to match the given pattern with extracted elements of html page
* and parse the result for the posts on the given instagram page
* @return instaProfile as a JSONArray object containing all posts and details of viewer
public JSONArray scrapeInstagram(BufferedReader br, String url) {
Document htmlPage = null;
Post instaObj = null;
JSONArray instaProfile = new JSONArray();
try {
htmlPage = Jsoup.parse(this.bufferedReaderToString(br));
} catch (IOException e) {
String script = htmlPage.getElementsByTag("script").html();
Matcher m = instaJsonData.matcher(script);
int start = m.start(1);
int end = m.start(2) + 1;
script = script.substring(start, end);
//TODO: pre-process the posts captured. At present, complete array of posts are output.
//Only useful data shall be outputted.
instaObj = new Post(script, this.query);
return instaProfile;
代码示例来源:origin: bonigarcia/webdrivermanager
// Excerpt from a routine that collects driver URLs from an HTML index page. The tail of the
// select(...) chain, the loop around iterator.next(), the rest of the else-if and closing
// braces are missing from this excerpt.
String driverStr = driverUrl.toString();
String driverUrlContent = driverUrl.getPath();
// `in` is presumably an InputStream over the index page — TODO confirm. A null charset lets
// jsoup detect the encoding; the base URI is left empty.
org.jsoup.nodes.Document doc = Jsoup.parse(in, null, "");
// NOTE(review): the call chain on the next line is cut off in this excerpt.
Iterator<org.jsoup.nodes.Element> iterator = doc.select("a")
List<URL> urlList = new ArrayList<>();
String link = iterator.next().attr("href");
// Directory-style links containing "mirror" are expanded via getDriversFromMirror, after
// removing the index page's own path from the href.
if (link.contains("mirror") && link.endsWith(SLASH)) {
urlList.addAll(getDriversFromMirror(new URL(
driverStr + link.replace(driverUrlContent, ""))));
} else if (link.startsWith(driverUrlContent)
代码示例来源:origin: HubSpot/jinjava
public Object filter(Object object, JinjavaInterpreter interpreter, String... arg) {
if (!(object instanceof String)) {
return object;
String val = interpreter.renderFlat((String) object);
String strippedVal = Jsoup.parseBodyFragment(val).text();
String normalizedVal = WHITESPACE.matcher(strippedVal).replaceAll(" ");
return normalizedVal;
代码示例来源:origin: magefree/mage
// Excerpt from a routine that downloads a page and collects card image links. The code that
// reads the proxied response into `tmp`, the loop over cardsImages and several closing braces
// are missing from this excerpt.
if (proxyType == ProxyType.NONE) {
// No proxy: let jsoup fetch the page directly.
urlDocument = pageUrl;
doc = Jsoup.connect(urlDocument).get();
} else {
// Proxy configured: open the connection through an HTTP proxy taken from preferences.
String proxyServer = prefs.get("proxyAddress", "");
int proxyPort = Integer.parseInt(prefs.get("proxyPort", "0"));
URL url = new URL(pageUrl);
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyServer, proxyPort));
HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
// NOTE(review): `tmp` is not declared or filled in the visible lines; presumably it holds
// the response body read from `uc` — confirm.
doc = Jsoup.parse(String.valueOf(tmp));
Elements cardsImages = doc.select("img[src^=cards/]"); // starts with cards/
// Also collect images whose src starts with any of the configured alias prefixes.
if (!aliasesStart.isEmpty()) {
for (String text : aliasesStart) {
cardsImages.addAll(doc.select("img[src^=" + text + ']'));
// NOTE(review): `cardsImage` is presumably the loop variable of an elided loop over
// cardsImages — confirm.
String cardLink = cardsImage.attr("src");
String cardName = null;
if (cardLink.startsWith("cards/") && cardLink.endsWith(".jpg")) {
代码示例来源:origin: magefree/mage
public static Document downloadHtmlDocument(String urlString) throws NumberFormatException, IOException {
Preferences prefs = MageFrame.getPreferences();
Connection.ProxyType proxyType = Connection.ProxyType.valueByText(prefs.get("proxyType", "None"));
Document doc;
if (proxyType == ProxyType.NONE) {
doc = Jsoup.connect(urlString).timeout(60 * 1000).get();
} else {
String proxyServer = prefs.get("proxyAddress", "");
int proxyPort = Integer.parseInt(prefs.get("proxyPort", "0"));
URL url = new URL(urlString);
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyServer, proxyPort));
HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
String line;
StringBuffer tmp = new StringBuffer();
BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream()));
while ((line = in.readLine()) != null) {
doc = Jsoup.parse(String.valueOf(tmp));
return doc;
代码示例来源:origin: asciidoctor/asciidoctorj
// Posts "Hello World" to the "asciidoctor" path relative to `url` and checks that the response
// body is a div.paragraph wrapping a <p> with that text. Closing braces are missing from this
// excerpt, and further lines may have been elided.
public void test() throws Exception {
HttpURLConnection conn = (HttpURLConnection) new URL(url, "asciidoctor").openConnection();
// NOTE(review): no setDoOutput(true) call is visible before getOutputStream(); presumably
// it was elided — confirm. getBytes() uses the platform default charset.
conn.getOutputStream().write("Hello World".getBytes());
// NOTE(review): buf is not used in the visible lines.
byte[] buf = new byte[65535];
try (InputStream in = conn.getInputStream()) {
final Document doc = Jsoup.parse(readFull(in));
// The first child of <body> is expected to be <div class="paragraph">.
final Element first = doc.body().children().first();
assertEquals("div", first.tagName());
assertEquals("paragraph", first.className());
// Its first child is expected to be <p>Hello World</p>.
final Element paragraph = first.children().first();
assertEquals("p", paragraph.tagName());
assertEquals("Hello World", paragraph.ownText());
代码示例来源:origin: kriegaex/Galileo-Openbook-Cleaner
private static SortedSet<String> getWebSiteURLs() throws Exception {
Document webPage;
Elements downloadLinks;
SortedSet<String> webSiteURLs = new TreeSet<>();
webPage = Jsoup.parse(new URL("https://www.rheinwerk-verlag.de/openbook/"), 10000);
downloadLinks = webPage.select("a.btn-primary");
for (Element link : downloadLinks) {
webSiteURLs.add(link.attr("href").replaceAll(".*/", ""));
return webSiteURLs;
代码示例来源:origin: decaywood/XueQiuSuperSpider
// Fills industryMap from the links found under the "second-nav" element of the comprehensive
// page. Elided in this excerpt: the tail of the getElementsByClass(...) chain, the code that
// fills `builder`, and the closing braces.
private void initMap() throws Exception {
industryMap = new HashMap<>();
String target = URLMapper.COMPREHENSIVE_PAGE.toString();
String content = request(new URL(target));
Document doc = Jsoup.parse(content);
// NOTE(review): the call chain on the next line is cut off in this excerpt.
Elements element = doc.getElementsByClass("second-nav")
StringBuilder builder = new StringBuilder();
for (Element ele : element) {
// Only elements carrying both a title and an href describe an industry.
if (!ele.hasAttr("title") || !ele.hasAttr("href")) continue;
// NOTE(review): nothing is appended to builder in the visible lines; presumably the href
// (or a URL derived from it) is appended in an elided line — confirm.
industryMap.put(ele.attr("title"), new Industry(ele.attr("title"), builder.toString()));
// Reset the builder for the next element.
builder.delete(0, builder.length());