unix 在bash中提取多个字符串并组合为.csv

lymnna71  于 2022-11-04  发布在  Unix
关注(0)|答案(5)|浏览(147)

我有一个很长的ID列表需要解析。我想提取三条信息并写入一个3列的CSV。第1列 = `tr|XXXX|` 中两个 `|` 之间的字段;第3列 = 第二个 `|` 之后、`OS=` 之前的字段。
第2列是有条件的。如果行中有'GN=XXX',我希望它返回XXX。如果GN=不存在,我希望写入第3列的第一部分(即,直到第一个空格)。
输入:

>tr|I1WXP1|I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment) OS=uncultured euryarchaeote OX=114243 GN=mcrA PE=4 SV=1
>tr|A0A059VAR9|A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment) OS=Halorubrum sp. Ga66 OX=1480727 GN=atpB PE=3 SV=1
>tr|Q51760|Q51760_9EURY Glutaredoxin-like protein OS=Pyrococcus furiosus OX=2261 PE=1 SV=1

所需输出:

I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Q51760_9EURY Glutaredoxin-like protein

我可以用awk得到前两个,例如:

# Print the second "|"-delimited field of every line of "file".
awk '{split($0,a,"|"); print a[2]}' file

但是我想不出该如何处理这个条件,也不知道如何巧妙地针对“GN=”模式进行操作。
例如,提取粗体文本:

tr|**I1WXP1**|**I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)** OS=uncultured euryarchaeote OX=114243 GN=**mcrA** PE=4 SV=1

变成:

I1WXP1, mcrA, I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
nxagd54h

nxagd54h1#

每当输入中包含 tag=value 对时,我发现最好先创建一个数组来保存这种映射,然后就可以通过标签(名称)来访问对应的值。例如,使用任意版本的awk:

$ cat tst.awk
# Parse headers of the form  >tr|ID|DESCRIPTION TAG=value TAG=value ...
# Output (CSV): ID, gene name (GN tag, else first word of DESCRIPTION), DESCRIPTION
BEGIN { FS="[|]"; OFS="," }
{
    # Start every record with an empty tag -> value map.
    delete tag2val

    # description is $3 up to (not including) the first " TAG=..." token;
    # assignments is whatever follows it.
    description = $3; sub(/ +[^ ]+=.*/,"",description)
    assignments = substr($3,length(description)+1)

    # Default used when the record has no GN= tag: first word of description.
    tag2val["GN"] = description; sub(/ .*/,"",tag2val["GN"])

    # Walk the tokens in order (the order of "for (i in a)" is unspecified) so
    # that a word without "=" is appended to the value of the preceding tag,
    # e.g. "OS=uncultured euryarchaeote" gives tag2val["OS"] = "uncultured euryarchaeote".
    numTokens = split(assignments,a," ")
    tag = ""
    for ( i=1; i<=numTokens; i++ ) {
        if ( a[i] ~ /^[^=]+=/ ) {
            tag = a[i]; sub(/=.*/,"",tag)
            tag2val[tag] = substr(a[i],length(tag)+2)
        }
        else if ( tag != "" ) {
            tag2val[tag] = tag2val[tag] " " a[i]
        }
    }

    print $2, tag2val["GN"], description
}
$ awk -f tst.awk file
I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Q51760_9EURY Glutaredoxin-like protein

使用这种方法,如果您还想打印或测试其他字段,也非常简单,例如:

$ cat tst.awk
# Parse headers of the form  >tr|ID|DESCRIPTION TAG=value TAG=value ...
# Output (CSV): ID, gene name (GN tag, else first word of DESCRIPTION),
#               OS value, PE value, DESCRIPTION
BEGIN { FS="[|]"; OFS="," }
{
    # Start every record with an empty tag -> value map.
    delete tag2val

    # description is $3 up to (not including) the first " TAG=..." token;
    # assignments is whatever follows it.
    description = $3; sub(/ +[^ ]+=.*/,"",description)
    assignments = substr($3,length(description)+1)

    # Default used when the record has no GN= tag: first word of description.
    tag2val["GN"] = description; sub(/ .*/,"",tag2val["GN"])

    # Walk the tokens in order (the order of "for (i in a)" is unspecified) so
    # that a word without "=" is appended to the value of the preceding tag,
    # e.g. "OS=uncultured euryarchaeote" gives tag2val["OS"] = "uncultured euryarchaeote".
    numTokens = split(assignments,a," ")
    tag = ""
    for ( i=1; i<=numTokens; i++ ) {
        if ( a[i] ~ /^[^=]+=/ ) {
            tag = a[i]; sub(/=.*/,"",tag)
            tag2val[tag] = substr(a[i],length(tag)+2)
        }
        else if ( tag != "" ) {
            tag2val[tag] = tag2val[tag] " " a[i]
        }
    }

    print $2, tag2val["GN"], tag2val["OS"], tag2val["PE"], description
}
$ awk -f tst.awk file
I1WXP1,mcrA,uncultured euryarchaeote,4,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,Halorubrum sp. Ga66,3,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Pyrococcus furiosus,1,Q51760_9EURY Glutaredoxin-like protein
kuarbcqp

kuarbcqp2#

作为sed中的备选项:

# 1st expression: lines that carry a GN= tag -> ID,gene,description.
# "t" skips the fallback when that substitution succeeded.
# 3rd expression: no GN= tag -> ID,first word of description,description.
# "t" sits in its own -e because POSIX sed does not allow ";" or "}" to
# follow a branch command on the same line (the original "b;}" relied on that).
sed -e 's/.*tr|\([^|]*\)|\(.*\) OS=.* GN=\([^ ]*\).*/\1,\3,\2/' -e t \
    -e 's/.*tr|\([^|]*\)|\(\([^ ]*\).*\) OS=.*/\1,\3,\2/' \
    file
gg0vcinb

gg0vcinb3#

这可能对你有用(GNU sed):

# GNU sed (uses -E and \S). Only lines matching one of the patterns are printed:
#   1. record has GN=  -> ID,gene name,description
#   2. otherwise       -> ID,first word of description,description
# "t" ends the cycle once the first substitution has succeeded.
sed -En -e 's/^[^|]*\|([^|]*)\|(.*) OS.*GN=(\S+).*/\1,\3,\2/p' -e 't' \
        -e 's/^[^|]*\|([^|]*)\|((\S+).*) OS.*/\1,\3,\2/p' file

使用模式匹配和反向引用来格式化所需的结果。
如果第一个匹配失败,请使用第二个匹配。
如果两者都不匹配,则无输出。

rur96b6h

rur96b6h5#

**第一个解决方案(GNU awk版本):** 针对您给出的示例,请尝试以下awk代码。该代码已在GNU awk中用所示示例编写并测试。

# GNU awk only: relies on \S and on the 3-argument form of match().
# Splitting each line on "^>tr|" or " OS=" leaves "ID|description" in $2.
awk -F'^>tr\\|| OS=' '
BEGIN { OFS = "," }
NF >= 2 {
  gsub(/\|/, OFS, $2)    # "ID|description" becomes "ID,description"
  if (match($0, /GN=(\S+)/, gene)) {
    # GN= present: ID, gene name, description.
    match($2, /(^[^,]*),(.*)/, part)
    print part[1], gene[1], part[2]
  }
  else {
    # No GN= tag: the first word of the description doubles as column 2.
    match($2, /(^[^,]*),(\S+)(.*)/, part)
    print part[1], part[2], part[2] part[3]
  }
}
'  Input_file

**第二个解决方案(适用于任何awk版本):** 这里再补充一段适用于任意awk版本的代码。

# Uses only match()/substr()/gsub() with RSTART/RLENGTH (no gawk extensions).
awk '
BEGIN { OFS = "," }
match($0, /^>tr.* OS=/) {
  # Keep what lies between the first 4 characters (">tr|" in the sample)
  # and the trailing " OS=" (4 characters), i.e. "ID|description".
  idDesc = substr($0, RSTART+4, RLENGTH-8)
  gsub(/\|/, ",", idDesc)

  if (match($0, /GN=[^[:space:]]+/)) {
    # GN= present: splice the gene name in right after the ID.
    gene = substr($0, RSTART+3, RLENGTH-3)
    match(idDesc, /^[^,]*/)
    print substr(idDesc, RSTART, RLENGTH), gene substr(idDesc, RSTART+RLENGTH)
  }
  else {
    # No GN= tag: the first word after the comma is used as column 2 as well.
    match(idDesc, /,[^[:space:]]+/)
    firstWord = substr(idDesc, RSTART+1, RLENGTH-1)
    print substr(idDesc, 1, RSTART-1), firstWord, firstWord substr(idDesc, RSTART+RLENGTH)
  }
}
'  Input_file

相关问题