from collections import namedtuple
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
def tmx2df(tmxfile):
    """Convert a TMX translation-memory file into a pandas DataFrame.

    Every ``<tu>`` (translation unit) element becomes one row.

    Args:
        tmxfile: Path to the TMX file to read.

    Returns:
        A ``pandas.DataFrame`` with a ``metadata`` column (the attribute
        dict of the ``<tu>`` element), a ``properties`` column (a dict
        mapping each ``<prop>`` element's ``type`` attribute to its text)
        and one column per ``xml:lang`` value found on the ``<tuv>``
        elements, holding the segment text for that language. Units that
        lack a given language get a missing value in that column. A file
        without any ``<tu>`` element yields an empty frame that still has
        the ``metadata`` and ``properties`` columns.

    Raises:
        KeyError: If a ``<prop>`` has no ``type`` attribute or a ``<tuv>``
            has no ``xml:lang`` attribute.
    """
    # Read raw bytes and let BeautifulSoup detect the encoding (BOM / XML
    # declaration) instead of decoding with the platform's locale default.
    with open(tmxfile, 'rb') as fin:
        content = fin.read()
    bsoup = BeautifulSoup(content, 'lxml')

    # Actual TMX extraction: keep a list of the rows to populate.
    rows = []
    for tu in tqdm(bsoup.find_all('tu')):
        # Parse metadata from tu.
        metadata = tu.attrs
        # Parse prop.
        properties = {
            prop.attrs['type']: prop.text for prop in tu.find_all('prop')
        }
        # Parse seg. The order of the languages might not be consistent,
        # so keep them in a dict keyed by language code first.
        segments = {}
        for tuv in tu.find_all('tuv'):
            segment = ' '.join(seg.text for seg in tuv.find_all('seg'))
            segments[tuv.attrs['xml:lang']] = segment
        rows.append({
            'metadata': metadata,
            'properties': properties,
            'segments': segments,
        })

    # Put the list of rows into a dataframe. Naming the columns explicitly
    # keeps the 'segments' column present even when there are no rows.
    df = pd.DataFrame(rows, columns=['metadata', 'properties', 'segments'])
    # Expand the per-row language dicts into one column per language.
    # Building the frame from a list of dicts is much faster than
    # ``apply(pd.Series)``; see https://stackoverflow.com/a/38231651
    segments_df = pd.DataFrame(df['segments'].tolist(), index=df.index)
    return pd.concat([df.drop(['segments'], axis=1), segments_df], axis=1)
3条答案
按热度按时间vojdkbi01#
正如@hurrial所说,你可以使用translate-toolkit。
安装
这个工具包只能通过 pip 安装。要安装它，请运行：`pip install translate-toolkit`
用法
假设您有以下简单的 `sample.tmx` 文件：

你可以这样解析这个简单的文件：
有关更多信息,请从这里查看官方文档。
kgsdhlau2#
您可以查看以下链接:
干杯
hjzp0vay3#
下面是一个可以轻松将TMX转换为pandas dataframe的脚本: