Python:如何在 pyparsing 的 nestedExpr 中保留空格

q3qa4bjr  于 2023-01-29  发布在  Python
关注(0)|答案(1)|浏览(193)

我有一段类似下面这样的维基文本:

# Sample wiki text: single-line macros, a macro containing a stray single
# brace, a nested macro, and a multi-line macro that wraps another macro.
data = """
    {{hello}}
    
    {{hello world}}
    {{hello much { }}
    {{a {{b}}}}
    
    {{a
    
    td {
        
    }
    {{inner}}
    }}
"""

我想提取其中的宏。宏是指包含在 {{ 和 }} 之间的文本,
所以我尝试使用 nestedExpr:

from pyparsing import *
import pprint

def getMacroCandidates(txt):
    """Scan *txt* for ``{{ ... }}`` macros and return their source text.

    Macros may nest and may span several lines.  The parsed token tree of
    every top-level match is printed as it is found.  The tokens themselves
    carry no whitespace, so the returned strings are sliced straight out of
    *txt* using the match offsets and therefore keep the original spacing
    and line breaks.

    Args:
        txt: Wiki text to search.

    Returns:
        A list with one string per top-level macro, in order of appearance,
        each running from its opening ``{{`` to its matching ``}}``.
    """

    candidates = []

    def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()):
        """Build a recursive grammar for text enclosed in *opener*/*closer*.

        Local variant of pyparsing's helper of the same name: the delimiters
        are kept in the result instead of being suppressed.

        Raises:
            ValueError: if *opener* equals *closer*, or if *content* is not
                given and the delimiters are not both strings.
        """
        if opener == closer:
            raise ValueError("opening and closing strings cannot be the same")
        if content is None:
            if not (isinstance(opener, str) and isinstance(closer, str)):
                raise ValueError("opening and closing arguments must be strings "
                                 "if no content expression is given")
            # A content word is a run of non-whitespace characters, consumed
            # one at a time so that it stops right before either delimiter
            # (and before anything matched by ignoreExpr).
            if ignoreExpr is not None:
                content = (Combine(OneOrMore(~ignoreExpr +
                                ~Literal(opener) + ~Literal(closer) +
                                CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1))
                            ).setParseAction(lambda t: t[0]))
            else:
                content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) +
                                CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1))
                            ).setParseAction(lambda t: t[0]))
        ret = Forward()
        if ignoreExpr is not None:
            ret <<= Group(opener + ZeroOrMore(ignoreExpr | ret | content) + closer)
        else:
            ret <<= Group(opener + ZeroOrMore(ret | content) + closer)

        ret.setName('nested %s%s expression' % (opener, closer))
        return ret

    # Macros are delimited by doubled braces; single braces are ordinary text.
    macro = nestedExpr("{{", "}}")
    # Scan the argument, not the module-level sample text.
    for toks, preloc, nextloc in macro.scanString(txt):
        print(toks)
        # preloc/nextloc bound the match in txt, so the slice is the macro
        # exactly as written, whitespace included.
        candidates.append(txt[preloc:nextloc])
    return candidates

# Sample wiki text: single-line macros, a macro containing a stray single
# brace, a nested macro, and a multi-line macro that wraps another macro.
data = """
{{hello}}

{{hello world}}
{{hello much { }}
{{a {{b}}}}

{{a

td {
    
}
{{inner}}
}}
"""

# Run the extraction on the sample; the returned list is not used here.
getMacroCandidates(data)

但这样会把内容拆分成一个个标记,并去掉其中的空格:

[['{{', 'hello', '}}']]
[['{{', 'hello', 'world', '}}']]
[['{{', 'hello', 'much', '{', '}}']]
[['{{', 'a', ['{{', 'b', '}}'], '}}']]
[['{{', 'a', 'td', '{', '}', ['{{', 'inner', '}}'], '}}']]
lg40wkob

lg40wkob1#

你能不能试试下面这种做法:

# Sample wiki text: single-line macros, a macro containing a stray single
# brace, a nested macro, and a multi-line macro that wraps another macro.
data = """
{{hello}}

{{hello world}}
{{hello much { }}
{{a {{b}}}}

{{a

td {

}
{{inner}}
}}
"""

import shlex

# Applied in order: the doubled delimiters must become quote characters
# before the leftover single braces are blanked out.
substitutions = (
    ("{{", '"'),
    ("}}", '"'),
    ("}", " "),
    ("{", " "),
)

data5 = data
for old, new in substitutions:
    data5 = data5.replace(old, new)

# Collapse every run of whitespace (newlines included) into a single space,
# then let shlex treat each quoted span as one token.
data5 = " ".join(data5.split())
print(shlex.split(data5))

输出
这将返回所有标记,其中的大括号、多余的空白字符和换行都已被去掉

['hello', 'hello world', 'hello much ', 'a b', 'a td inner ']

PS:这些步骤可以合并成一个表达式,这里拆成多条语句只是为了可读性。

相关问题