# Print the title and content of every <page> element that is a direct child
# of the root element of test.xml.
#
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree is
# the supported module and offers the same parse/getroot/findall API.
from xml.etree import ElementTree as ET

tree = ET.parse("test.xml")
root = tree.getroot()

# findall('page') only matches direct children of the root element.
for page in root.findall('page'):
    # findtext() returns the default instead of raising AttributeError when
    # a <page> has no matching child element.
    print("Title: ", page.findtext('title', default=''))
    print("Content: ", page.findtext('content', default=''))
输出:
Title: Chapter 1
Content: Welcome to Chapter 1
Title: Chapter 2
Content: Welcome to Chapter 2
from bs4 import BeautifulSoup
import csv
# Sample input: two sibling <page> elements, each holding a <title> and a
# <content> child.
data = """<page>
<title>Chapter 1</title>
<content>Welcome to Chapter 1</content>
</page>
<page>
<title>Chapter 2</title>
<content>Welcome to Chapter 2</content>
</page>"""
soup = BeautifulSoup(data, "html.parser")

# Text of every <title> element, in document order.
title = [node.get_text() for node in soup.find_all("title")]

# Text of every <content> element, in document order.
content = [node.get_text() for node in soup.find_all("content")]

# Pair the n-th title with the n-th content; zip() stops at the shorter list.
doc1 = list(zip(title, content))
for pair in doc1:
    print(pair)
输出:
('Chapter 1', 'Welcome to Chapter 1')
('Chapter 2', 'Welcome to Chapter 2')
In [18]: import xml.dom.minidom
In [19]: x = """\
<root><page>
<title>Chapter 1</title>
<content>Welcome to Chapter 1</content>
</page>
<page>
<title>Chapter 2</title>
<content>Welcome to Chapter 2</content>
</page></root>"""
In [28]: doc = xml.dom.minidom.parseString(x)
In [29]: doc.getElementsByTagName("page")
Out[30]: [<DOM Element: page at 0x94d5acc>, <DOM Element: page at 0x94d5c8c>]
In [32]: [p.firstChild.wholeText for p in doc.getElementsByTagName("title") if p.firstChild.nodeType == p.TEXT_NODE]
Out[33]: [u'Chapter 1', u'Chapter 2']
In [34]: [p.firstChild.wholeText for p in doc.getElementsByTagName("content") if p.firstChild.nodeType == p.TEXT_NODE]
Out[35]: [u'Welcome to Chapter 1', u'Welcome to Chapter 2']
In [36]: for node in doc.childNodes:
if node.hasChildNodes:
for cn in node.childNodes:
if cn.hasChildNodes:
for cn2 in cn.childNodes:
if cn2.nodeType == cn2.TEXT_NODE:
print cn2.wholeText
Out[37]: Chapter 1
Welcome to Chapter 1
Chapter 2
Welcome to Chapter 2
6条答案
按热度按时间vof42yt11#
Python 已经内置了 XML 库,特别是 ElementTree。
s8vozzvw2#
代码:
输出:
j5fpnvbx3#
您也可以尝试以下代码来提取文本:
输出:
wz3gfoph4#
我个人更喜欢使用 xml.dom.minidom 进行解析,如下所示:
oprakyz75#
推荐你一个简单的库。下面是一个例子:https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
结果:
fzsnzjdm6#
在处理 XML 或 HTML 数据(导航、搜索和修改)时,我发现 BeautifulSoup 库非常有用。有关安装问题或详细信息,请单击 link。
要查找属性(标记)或多属性值:
输出: