import os
import pandas as pd
def get_data_dict(file_path: str, to_replace:str="##########") -> dict:
    """read the leading header lines of a mol2 file into a dict

    A header line is any line that starts with `to_replace`. The text between the
    marker and the first ':' becomes the key, everything after that ':' the value.
    Reading stops at the first line that does not start with the marker.

    :parameter
        - file_path:
          path to the file of interest
        - to_replace:
          the 'marker' that specifies the header
    :return
        - data_dict
          dict containing the header parts as keys and a one-element list with
          their value as value (the element is None if the line has no ':');
          empty dict if the file does not start with a header line
    """
    data_dict = {}
    # open and read file until all header info is read
    with open(file_path, "r") as mol2_file:
        for line in mol2_file:
            # header ended
            if not line.startswith(to_replace):
                break
            # split on the first ':' only, so values that contain ':' stay intact
            head, sep, tail = line.partition(":")
            key = head.replace(to_replace, "").strip()
            # each value is wrapped in a list so the dict can be turned into a DataFrame row
            data_dict[key] = [tail.strip() if sep else None]
    # also reached for empty files and files that consist of header lines only
    return data_dict
# path to the directory containing all files
parent_path = "/PARENT/PATH"
# build one DataFrame per directory entry and concatenate them once afterwards;
# calling pd.concat inside the loop would copy the accumulated rows on every iteration
header_frames = [
    pd.DataFrame.from_dict(get_data_dict(os.path.join(parent_path, file)))
    for file in os.listdir(parent_path)
]
# pd.concat raises on an empty list, so an empty directory yields an empty DataFrame
all_data = pd.concat(header_frames) if header_frames else pd.DataFrame()
1条答案
按热度按时间3hvapo4f1#
这会将所有文件的头信息保存到一个 pandas DataFrame 中,之后您可以按需对其进行处理(例如保存为 CSV 或执行其他操作)。如果您来这里是为了寻找解析 mol2 文件的方法,可以使用 biopandas。