我有一个包含多个序列的fasta文件,尝试计算每个序列的密码子数量,计算每个序列的rcsu值,并最终将其存储在一个Dataframe中。因此,我想制作一个嵌套字典,将序列名(specie_1,specie_2)存储为“super key”,将密码子存储为key及其相关的rcsu值(对于每个序列名),然后将其转换为一个带有pandas的Dataframe。我对Dataframe的外观有问题,它看起来不像我想要的那样。下面是python(版本3)代码:
from Bio import SeqIO
import pandas as pd
CG_nucleic_file = '/home/ju/Documents/data_128/CG_all_seq_per_species.fasta'
CodonsDict = {
'TTT': 0, 'TTC': 0, 'TTA': 0, 'TTG': 0, 'CTT': 0,
'CTC': 0, 'CTA': 0, 'CTG': 0, 'ATT': 0, 'ATC': 0,
'ATA': 0, 'ATG': 0, 'GTT': 0, 'GTC': 0, 'GTA': 0,
'GTG': 0, 'TAT': 0, 'TAC': 0, 'TAA': 0, 'TAG': 0,
'CAT': 0, 'CAC': 0, 'CAA': 0, 'CAG': 0, 'AAT': 0,
'AAC': 0, 'AAA': 0, 'AAG': 0, 'GAT': 0, 'GAC': 0,
'GAA': 0, 'GAG': 0, 'TCT': 0, 'TCC': 0, 'TCA': 0,
'TCG': 0, 'CCT': 0, 'CCC': 0, 'CCA': 0, 'CCG': 0,
'ACT': 0, 'ACC': 0, 'ACA': 0, 'ACG': 0, 'GCT': 0,
'GCC': 0, 'GCA': 0, 'GCG': 0, 'TGT': 0, 'TGC': 0,
'TGA': 0, 'TGG': 0, 'CGT': 0, 'CGC': 0, 'CGA': 0,
'CGG': 0, 'AGT': 0, 'AGC': 0, 'AGA': 0, 'AGG': 0,
'GGT': 0, 'GGC': 0, 'GGA': 0, 'GGG': 0}
# this dictionary shows which codons encode the same AA
SynonymousCodons = {
'CYS': ['TGT', 'TGC'],
'ASP': ['GAT', 'GAC'],
'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
'GLN': ['CAA', 'CAG'],
'MET': ['ATG'],
'ASN': ['AAC', 'AAT'],
'PRO': ['CCT', 'CCG', 'CCA', 'CCC'],
'LYS': ['AAG', 'AAA'],
'STOP': ['TAG', 'TGA', 'TAA'],
'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
'PHE': ['TTT', 'TTC'],
'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
'GLY': ['GGT', 'GGG', 'GGA', 'GGC'],
'ILE': ['ATC', 'ATA', 'ATT'],
'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'],
'HIS': ['CAT', 'CAC'],
'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'],
'TRP': ['TGG'],
'VAL': ['GTA', 'GTC', 'GTG', 'GTT'],
'GLU': ['GAG', 'GAA'],
'TYR': ['TAT', 'TAC']}
# DNA bases that can occupy each codon position
CodonBases = {'A' : 0, 'C' : 0, 'G' : 0, 'T' : 0}
def count_codons(fasta_file):
with open(fasta_file, 'r') as handle:
# make the codon dictionary local
codon_count = CodonsDict.copy()
# iterate over sequence and count all the codons in the FastaFile.
for cur_record in SeqIO.parse(handle, "fasta"):
# make sure the sequence is lower case
if str(cur_record.seq).islower():
dna_sequence = str(cur_record.seq).upper()
else:
dna_sequence = str(cur_record.seq)
#print(dna_sequence)
for i in range(0, len(dna_sequence), 3):
codon = dna_sequence[i:i + 3]
if codon in codon_count:
codon_count[codon] += 1
#else: raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
#print(cur_record.id, codon_count)
# now to calculate the index we first need to sum the number of times
# synonymous codons were used all together.
for aa in SynonymousCodons:
total = 0.0
# RCSU values are CodonCount/((1/num of synonymous codons) * sum of
# all synonymous codons)
rcsu = []
codons = SynonymousCodons[aa]
for codon in codons:
total += codon_count[codon]
# calculate the RSCU value for each of the codons
for codon in codons:
denominator = float(total) / len(codons)
rcsu.append(codon_count[codon] / denominator)
#print(cur_record.id, codons, rcsu)
# CREATE DICTIONARY
sequences = [cur_record.id]
dic = dict(zip(codons, rcsu))
hyper_dic = dict.fromkeys(sequences, dic)
#print(hyper_dic)
# CREATE DATAFRAME
df = pd.DataFrame.from_dict(hyper_dic)
print(df)
count_codons(fasta_file=CG_nucleic_file)
打印df后,输出的一部分是:
SPECIE_1
GAA 0.457874
GAG 1.542126
SPECIE_1
TAC 1.643541
TAT 0.356459
SPECIE_2
TGC 1.322851
TGT 0.677149
SPECIE_2
GAC 1.462735
GAT 0.537265
SPECIE_2
AGC 3.089088
AGT 0.474951
TCA 0.618015
TCC 0.869249
TCG 0.652648
TCT 0.296049
SPECIE_2
CAA 0.224834
CAG 1.775166
SPECIE_2
ATG 1.0
SPECIE_2
AAC 1.747474
AAT 0.252526
SPECIE_2
CCA 0.709466
CCC 1.174944
CCG 1.734340
CCT 0.381251
SPECIE_2
AAA 0.333452
AAG 1.666548
SPECIE_2
TAA 0.439291
TAG 0.272399
TGA 2.288311
SPECIE_2
ACA 0.493505
ACC 1.937182
ACG 1.108230
ACT 0.461084
SPECIE_2
TTC 1.569263
TTT 0.430737
SPECIE_2
GCA 0.628793
GCC 1.786888
GCG 0.943387
GCT 0.640932
SPECIE_2
GGA 1.186415
GGC 1.497532
GGG 0.551426
GGT 0.764627
SPECIE_2
ATA 1.420639
ATC 1.003165
ATT 0.576195
SPECIE_2
CTA 0.272501
CTC 2.970173
CTG 1.074753
CTT 1.036392
TTA 0.295155
TTG 0.351025
SPECIE_2
CAC 1.725651
CAT 0.274349
SPECIE_2
AGA 1.119508
AGG 3.383216
CGA 0.047087
CGC 1.099417
CGG 0.175471
CGT 0.175301
SPECIE_2
TGG 1.0
SPECIE_2
GTA 0.292662
GTC 1.662430
GTG 0.616281
GTT 1.428627
Specie_2
GAA 0.456977
GAG 1.543023
Specie_2
TAC 1.644978
TAT 0.355022
而不是:
SPECIE_1 SPECIE_2 other_species...
TGC 0.567 1.076 ...
... ... ... ...
codons value value value
我刚刚开始学习python,不知道如何解决这个问题。我认为这是因为print(hyper_dic)(也叫print(sequences))输出多个嵌套字典,而不是一个:
{'SPECIE_1': {'GAG': 1.5421256842253714, 'GAA': 0.4578743157746285}}
{'SPECIE_1': {'TAT': 0.3564585901175933, 'TAC': 1.6435414098824066}}
{'SPECIE_2': {'TGT': 0.6771493242971753, 'TGC': 1.3228506757028247}}
{'SPECIE_2': {'GAT': 0.5372647945111357, 'GAC': 1.4627352054888643}}
{'SPECIE_2': {'TCT': 0.29604880551912877, 'TCG': 0.6526483836022577, 'TCA': 0.6180147100746907, 'TCC': 0.8692491020012543, 'AGC': 3.089088317463937, 'AGT': 0.4749506813387308}}
{'SPECIE_2': {'CAA': 0.22483409972224053, 'CAG': 1.7751659002777596}}
{'SPECIE_2': {'ATG': 1.0}}
{'SPECIE_2': {'AAC': 1.7474737818764585, 'AAT': 0.2525262181235415}}
{'SPECIE_2': {'CCT': 0.38125098616066655, 'CCG': 1.734339768614896, 'CCA': 0.7094656154406038, 'CCC': 1.1749436297838336}}
{'SPECIE_2': {'AAG': 1.6665484206879835, 'AAA': 0.33345157931201647}}
{'SPECIE_2': {'TAG': 0.27239861292147266, 'TGA': 2.288310666240378, 'TAA': 0.4392907208381496}}
{'SPECIE_2': {'ACC': 1.9371817149218815, 'ACA': 0.49350453967609204, 'ACG': 1.1082300904926319, 'ACT': 0.4610836549093945}}
{'SPECIE_2': {'TTT': 0.4307370502428151, 'TTC': 1.5692629497571848}}
{'SPECIE_2': {'GCA': 0.6287929079476343, 'GCC': 1.7868879496959076, 'GCG': 0.9433872796618905, 'GCT': 0.6409318626945676}}
{'SPECIE_2': {'GGT': 0.764627289015375, 'GGG': 0.5514255653469322, 'GGA': 1.1864153566270856, 'GGC': 1.497531789010607}}
{'SPECIE_2': {'ATC': 1.003165304483394, 'ATA': 1.4206394112819318, 'ATT': 0.5761952842346741}}
{'SPECIE_2': {'TTA': 0.29515482086967093, 'TTG': 0.3510247956486068, 'CTC': 2.9701734327995384, 'CTT': 1.036392020419946, 'CTG': 1.0747534868880855, 'CTA': 0.272501443374153}}
{'SPECIE_2': {'CAT': 0.2743491778419728, 'CAC': 1.7256508221580271}}
{'SPECIE_2': {'CGA': 0.047087050675736423, 'CGC': 1.0994171275614661, 'CGG': 0.17547054940030157, 'CGT': 0.17530100519164918, 'AGG': 3.38321615088407, 'AGA': 1.1195081162867764}}
{'SPECIE_2': {'TGG': 1.0}}
{'SPECIE_2': {'GTA': 0.2926620254806615, 'GTC': 1.6624303723824536, 'GTG': 0.6162805781696535, 'GTT': 1.4286270239672312}}
{'SPECIE_2': {'GAG': 1.5430230245080025, 'GAA': 0.45697697549199756}}
{'SPECIE_2': {'TAT': 0.3550221815082481, 'TAC': 1.6449778184917518}}
我读了其他的文章,但找不到答案。有人能解决这个问题吗?先谢谢你
1条答案
按热度按时间rqcrx0a61#
它可以帮助您根据您希望看到的Dataframe来考虑数据的外观。所以如果你想看:
然后您需要一个dict列表作为表单的输入:
从您想要的数据向后工作,您可以找到如何格式化数据—正确地获取一个dict并创建一个单行Dataframe可以帮助您完成这项工作,因为这样您就知道需要如何处理所有数据。