# Sample HTML document ("three sisters") used by every example below.
html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """
import re  # required by the regular-expression filter examples below (re.compile / re.search)

from bs4 import BeautifulSoup

# Parse the document with Python's built-in HTML parser.
soup = BeautifulSoup(html_doc, 'html.parser')
Beautiful Soup 定义了很多搜索方法,这里着重介绍 2 个:
find();
find_all() ;
find_all() 等搜索方法接受 string、list、regular expression、True、function 五种类型的过滤器:string
过滤器主要用于完全匹配属性值;list
过滤器可以极其方便地查找多个值;regular expression
过滤器可以用于不完全匹配等其他特殊匹配;True
过滤器可以用来确定存在某些属性;function
过滤器最为强大,尽管写起来比上述几个过滤器复杂,但是可以实现任何过滤。下面的例子查找文档中所有的 <b>
标签:soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all('b'))
输出:
[<b>The Dormouse's story</b>]
<a>
标签和 <b>
标签:soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all(["a", "b"]))
输出:
[<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<body>
和 <b>
标签都应该被找到:soup = BeautifulSoup(html_doc, 'html.parser')
for tag in soup.find_all(re.compile("^b")):
print(tag.name)
输出:
body
b
soup = BeautifulSoup(html_doc, 'html.parser')
for tag in soup.find_all(True):
print(tag.name)
输出:
html
head
title
body
p
b
p
a
a
a
p
soup = BeautifulSoup(html_doc, 'html.parser')
def has_class_but_no_id(tag):
    """Return True when *tag* carries a 'class' attribute but no 'id' attribute."""
    has_class = tag.has_attr('class')
    lacks_id = not tag.has_attr('id')
    return has_class and lacks_id
#将这个方法作为参数传入 find_all() 方法,将得到所有<p>标签:
print(soup.find_all(has_class_but_no_id))
输出:
[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]
一开始我对上述输出也有误解,明明是有 <a>
标签的,但是后来通过调试后发现,其实是因为 <a>
标签是在 <p>
标签中包含;
soup = BeautifulSoup(html_doc, 'html.parser')
def not_lacie(href):
    """Filter: accept href values that are non-empty and do not contain 'lacie'."""
    if not href:
        # Preserve the original falsy value (None / "") so find_all rejects it.
        return href
    return re.search("lacie", href) is None
print(soup.find_all(href=not_lacie))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
find_all( name , attrs , recursive , string , **kwargs )
soup.find_all("title")
print(soup.find_all(id='link2'))
--- 输出 ---
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all(string="Elsie"))
print(soup.find_all(string=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(string=re.compile("Dormouse")))
输出:
['Elsie']
['Elsie', 'Lacie', 'Tillie']
["The Dormouse's story", "The Dormouse's story"]
<a>
标签:soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all("a", string="Elsie"))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all("a", limit=2))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all("a"))
print("----------------")
print(soup("a"))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
----------------
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
find( name , attrs , recursive , string , **kwargs )
<body>
标签,那么使用 find_all() 方法来查找 <body>
标签就不太合适;soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all('title', limit=1))
print("--------")
print(soup.find('title'))
输出:
[<title>The Dormouse's story</title>]
--------
<title>The Dormouse's story</title>
从上边看出,find_all() 方法的返回结果是只包含一个元素的列表,而 find() 方法直接返回结果。
find_parents( name , attrs , recursive , string , **kwargs )
find_parent( name , attrs , recursive , string , **kwargs )
soup = BeautifulSoup(html_doc, 'html.parser')
string_a = soup.find(string="Lacie")
print(string_a.find_parents("a"))
print("------------------")
print(string_a.find_parent("p"))
输出:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
------------------
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
find_next_siblings( name , attrs , recursive , string , **kwargs )
返回所有符合条件的后面的兄弟节点;
find_next_sibling( name , attrs , recursive , string , **kwargs )
只返回符合条件的后面的第一个 tag 节点;
soup = BeautifulSoup(html_doc, 'html.parser')
link_first = soup.a
print(link_first.find_next_siblings("a"))
print("_________________")
print(link_first.find_next_sibling("a"))
输出:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
_________________
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
find_previous_siblings( name , attrs , recursive , string , **kwargs )
返回所有符合条件的前面的兄弟节点;
find_previous_sibling( name , attrs , recursive , string , **kwargs )
返回第一个符合条件的前面的兄弟节点;
find_all_next( name , attrs , recursive , string , **kwargs )
返回所有符合条件的节点;
find_next( name , attrs , recursive , string , **kwargs )
返回第一个符合条件的节点;
find_all_previous( name , attrs , recursive , string , **kwargs )
返回所有符合条件的节点;
find_previous( name , attrs , recursive , string , **kwargs )
返回第一个符合条件的节点;
.select()
方法中传入字符串参数,即可使用 CSS 选择器的语法找到tag。soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select("title"))
print("-----")
print(soup.select("p:nth-of-type(3)"))
输出:
[<title>The Dormouse's story</title>]
-----
[<p class="story">...</p>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select("body a"))
print("-----")
print(soup.select("html head title"))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
-----
[<title>The Dormouse's story</title>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select("head > title"))
print("-----")
print(soup.select("p > #link1"))
输出:
[<title>The Dormouse's story</title>]
-----
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select("#link1 ~ .sister"))
print("-----")
print(soup.select("#link1 + .sister"))
输出:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
-----
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select(".sister"))
print("-----")
print(soup.select("[class~=sister]"))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
-----
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select("#link1"))
print("-----")
print(soup.select("a#link2"))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
-----
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select("#link1,#link2"))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select('a[href]'))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select('a[href="http://example.com/elsie"]'))
print(soup.select('a[href^="http://example.com/"]'))
输出:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.select_one(".sister"))
输出:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
版权说明 : 本文为转载文章, 版权归原作者所有 版权申明
原文链接 : https://blog.csdn.net/S_numb/article/details/120218087
内容来源于网络,如有侵权,请联系作者删除!