import re
# Sample input so the snippet runs as written; replace it with the real
# string to search. The previous placeholder (`...`) is the Ellipsis object,
# which re.findall rejects with "TypeError: expected string or bytes-like
# object".
text = "7890[#CB:Customer happy with service][CR:Refund not required]Five"

# Collect every payload that sits between "[#CB:" and the next "]".
# The lazy quantifier (.*?) stops at the first closing bracket, so several
# tags in the same string are returned as separate list items.
cb_data = re.findall(r"\[#CB:(.*?)\]", text)
print(cb_data)  # list of all CB payloads found in text

# Same extraction for the "[CR:" marker.
cr_data = re.findall(r"\[CR:(.*?)\]", text)
print(cr_data)  # list of all CR payloads found in text
如果您的数据始终是XML格式:
import xml.etree.ElementTree as ET

xml_str1 = '<Order><Remark>Food was good</Remark><UserID>7890</UserID><Filter>[#CB:Customer happy with service]</Filter><Rating>Five</Rating></Order>'
xml_str2 = '<Order><UserID>7880</UserID><Filter>[#CB:Customer had a good time</Filter><Remark>Food was good</Remark><Additional>Service up to par</Additional><Remark>(#CB:Customer will return again]</Remark><End>Thats all</End></Order>'


def _extract_cb(filter_text):
    """Return the payload that follows '#CB:' in *filter_text*.

    The payload ends at the first ']' after the marker, or at the end of the
    string when the closing bracket is missing. An empty string is returned
    when the '#CB:' marker is absent.
    """
    _, marker, payload = filter_text.partition('#CB:')
    if not marker:
        return ''
    # partition() returns the whole string as element 0 when ']' is absent.
    # The earlier slice used str.find(']'), which returns -1 in that case and
    # silently chopped off the last character of the payload.
    return payload.partition(']')[0]


root1 = ET.fromstring(xml_str1)
root2 = ET.fromstring(xml_str2)

# findtext() yields the default instead of raising AttributeError when the
# <Filter> element is missing.
cb_filter1 = root1.findtext('Filter', default='')
cb_data1 = _extract_cb(cb_filter1)
cb_filter2 = root2.findtext('Filter', default='')
cb_data2 = _extract_cb(cb_filter2)

print(cb_data1)  # Customer happy with service
print(cb_data2)  # Customer had a good time
import re
import pandas as pd
# Lazy match of whatever sits between "[" and the next "]"; the captured
# group keeps the "#CB:" prefix because only the brackets are excluded.
p = re.compile(r"\[(.*?)\]")

# Two sample review strings, each carrying one or more bracketed tags.
s1 = "<Order No. 10>Food was good7890[#CB:Customer happy with service]Five"
s2 = " Five<Order No. 17>7880[#CB:Customer had a good time]Food was goodService up to par[#CB:Customer will return again]Thats all"

# Show the tags found in each sample, one list per line.
print(p.findall(s1), p.findall(s2), sep="\n")

# One row per scenario, with the raw review text kept in the 'reviews' column.
d = pd.DataFrame(
    {
        'scenario': [1, 2],
        'order_id': ['1234', '1235'],
        'reviews': [s1, s2],
    }
)
def padList(l, length=4):
    """Return a new list holding exactly ``length`` items taken from ``l``.

    Extra items beyond ``length`` are dropped; a shorter input is padded on
    the right with empty strings. The argument itself is never modified.

    Args:
        l: Sequence of values to trim or pad.
        length: Number of items the result must contain (default 4).

    Returns:
        A list with the leading items of ``l`` followed by '' padding.
    """
    # list(...) copies the slice, so tuples and other sliceable sequences
    # work too and the caller's object is left untouched.
    padded = list(l[:length])
    # Multiplying by a non-positive count yields [], so a full list is
    # returned unchanged.
    padded += [''] * (length - len(padded))
    return padded
# For each row, pull the bracketed tags out of 'reviews', force the result to
# four entries with padList, and spread those entries over four new columns
# (result_type='expand' turns the returned list into one column per item).
d[['review1', 'review2', 'review3', 'review4']] = d.apply(
    lambda row: padList(p.findall(row['reviews'])),
    axis=1,
    result_type='expand',
)
# Bare expression: displays the frame in a notebook/REPL, no effect in a script.
d
2条答案
按热度按时间68de4m5k1#
数据似乎是XML格式的(尽管 `<Order No. 10>` 使其无效;我已经将其替换为 `<Order>`)。您可以遍历每个元素的值并检查它们是否以 #CB 开头。或者,您可以执行如下所示的正则表达式搜索。但是,如果您确信数据始终遵循XML格式,则前一种方法更合适。
如果您的数据始终是XML格式:
btqmn9zl2#
在另一个答案的基础上,下面的代码可以按您要求的格式得到结果。