from BeautifulSoup import BeautifulSoup
# Load the page from disk; the markup could just as well be retrieved from the
# web. The `with` block closes the file handle as soon as the read finishes.
with open('/yourwebsite/page.html', 'r') as html_file:
    html_data = html_file.read()
# Create the soup object from the HTML data
soup = BeautifulSoup(html_data)
# Find the proper tag. The attribute filters go in `attrs`: `name` cannot be
# passed as a keyword because the first parameter of find() (the tag name) is
# itself called `name`, so `find('input', name=...)` raises TypeError.
fooId = soup.find('input', attrs={'name': 'fooId', 'type': 'hidden'})
# Look the attribute up by name rather than by position in fooId.attrs, so the
# result does not depend on the order of the attributes in the markup.
value = fooId['value']
from BeautifulSoup import BeautifulSoup
# Read the HTML from a local file (it could also be retrieved from the web).
# Using `with` guarantees the file is closed once its contents are read.
with open('/yourwebsite/page.html', 'r') as html_file:
    html_data = html_file.read()
# Create the soup object from the HTML data
soup = BeautifulSoup(html_data)
# Find the proper tag. Filter on the HTML attributes through `attrs`; a `name`
# keyword would collide with find()'s own first parameter (the tag name, also
# called `name`) and raise TypeError.
fooId = soup.find('input', attrs={'name': 'fooId', 'type': 'hidden'})
value = fooId['value']  # The value attribute, looked up by name
# Sample markup: several spellings of the same hidden input (different
# attribute order, upper-case attribute and tag names) plus one commented-out
# copy that must not be reported.
html = """<html><body>
<input type="hidden" name="fooId" value="**[id is here]**" />
<blah>
<input name="fooId" type="hidden" value="**[id is here too]**" />
<input NAME="fooId" type="hidden" value="**[id is HERE too]**" />
<INPUT NAME="fooId" type="hidden" value="**[and id is even here TOO]**" />
<!--
<input type="hidden" name="fooId" value="**[don't report this id]**" />
-->
<foo>
</body></html>"""
from pyparsing import makeHTMLTags, withAttribute, htmlComment
# use makeHTMLTags to create tag expression - makeHTMLTags returns expressions for
# opening and closing tags, we're only interested in the opening tag
inputTag = makeHTMLTags("input")[0]
# only want input tags with special attributes
inputTag.setParseAction(withAttribute(type="hidden", name="fooId"))
# don't report tags that are commented out
inputTag.ignore(htmlComment)
# use searchString to skip through the input
foundTags = inputTag.searchString(html)
# dump out first result to show all returned tags and attributes
# (single-argument print(...) calls behave the same on Python 2 and Python 3)
print(foundTags[0].dump())
# blank separator line between the dump and the list of values
print("")
# print out the value attribute for all matched tags
for inpTag in foundTags:
    print(inpTag.value)
输出:
['input', ['type', 'hidden'], ['name', 'fooId'], ['value', '**[id is here]**'], True]
- empty: True
- name: fooId
- startInput: ['input', ['type', 'hidden'], ['name', 'fooId'], ['value', '**[id is here]**'], True]
  - empty: True
  - name: fooId
  - type: hidden
  - value: **[id is here]**
- type: hidden
- value: **[id is here]**
**[id is here]**
**[id is here too]**
**[id is HERE too]**
**[and id is even here TOO]**
7条答案
按热度按时间d7v8vwbk1#
对于这个特定的例子,用BeautifulSoup写起来比regex更麻烦,但它要健壮得多......我这里只提供BeautifulSoup的例子,因为您已经知道该用哪个regexp了 :-)
am46iovg2#
我同意Vinko的看法,BeautifulSoup是可行的方法,但是我建议使用
fooId['value']
来获取属性,而不是依赖于value恰好是第三个属性。
pod7payv3#
hmtdttj44#
解析是这样一个领域:只要能避免,你真的不会想自己动手实现,因为在未来的几年里你都得不停地追查各种边缘情况和bug。
我推荐使用BeautifulSoup,它有很好的声誉,从文档看起来很容易使用。
voase2hg5#
Pyparsing是介于BeautifulSoup和regex之间的一个很好的折中方案。它比单纯的regex更健壮,因为它的HTML标签解析能够处理大小写、空格以及属性存在/不存在/顺序的变化,但对于这种基本的标签提取,它又比使用BS更简单。
这个例子特别简单,因为你要找的所有东西都在开头的"input"标签的属性中。下面是一个pyparsing例子,展示了input标签的几种会让regex难以应付的变体,也展示了当标签位于注释中时如何不去匹配它:
输出:
您可以看到,pyparsing不仅匹配这些不可预测的变化,而且还返回一个对象中的数据,这使得读取各个标记属性及其值变得容易。
mnowg1ta6#
flseospp7#