regex 使用运算符优先级解析器解析正则表达式？

我不认为正则表达式有Operator Precedence Grammar，如果是这种情况，那么您就不能使用Operator Precedence解析器。
下面是我从a Perl-style one简化而来的正则表达式语法：

<RE>    ::=     <union> | <simple-RE>
<union>     ::= <RE> "|" <simple-RE>
<simple-RE>     ::=     <concatenation> | <basic-RE>
<concatenation>     ::= <simple-RE> <basic-RE>
<basic-RE>  ::= <star> | <plus> | <elementary-RE>
<star>  ::= <elementary-RE> "*"
<plus>  ::= <elementary-RE> "+"
<elementary-RE>     ::= <group> | <any> | <char>
<group>     ::=     "(" <RE> ")"
<any>   ::=     "."
<char>  ::=     any non metacharacter | "\" metacharacter

注意，<concatenation>的右边有两个相邻的非终结符，这意味着这不是一个Operator Precedence语法。
我认为解析正则表达式的首选方法可能是Recursive Descent parser。

正则表达式确实可以用运算符优先级解析器来解析，所提供的C代码基本上使用了pratt解析器（操作符优先）方法。程序打印NFA（非确定性有限自动机）由一系列状态转换表示。输入是一个正则表达式，通常带有 *？（）+.[]运算符。pratt解析器函数 parse（） 首先为正则表达式创建一个表达式树，它处理两个连续的字符（或括号中的组），就好像它们之间有一个不可见的二元CONCAT操作符。之后，函数 print_nfa（） 从右到左遍历表达式树，并动态打印NFA状态转换。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_PREC 10
#define MAX_CHAR 255 // code for chars from 0 to incl. MAX_CHAR

enum Type { CHAR = MAX_CHAR+1, CONCAT, ANY, SET };

typedef unsigned short IType;

struct Node {
    IType type, ch, state;
    struct Node *left, *right;
    char charset[1];
};

static char *regex;
static IType state;
static struct Node *parse(int prec, struct Node *left);

static int next_token(void) { return *regex ? *regex++ : 0; }

struct Node *allocnode(IType type, IType ch, int extra)
{
    struct Node *node = calloc(1, sizeof(struct Node) + extra);
    node->type  = type;
    node->ch    = ch;
    node->state = state++;
    return node;
}

static int getprec(int op) // get the precedence level of op (highest precedence is 0)
{
    switch (op) {
        case '[': case '(': return 0;           // brackets and parenthesis are like literals: highest precedence
        case '*': case '+': case '?': return 1; // postfix operators
        case CONCAT: default: return 2;         // concat
        case '|': return 3;                     // alternate operator is lowest precedence
        case ')': return 4;                     // closing parenthesis is even lower
    }
}

static struct Node *charset(void)
{
    struct Node *node;
    int         i;

    node = allocnode(SET, 0, 1024);

    // scan until end of set
    for (i=0; regex[0] != ']' && regex[0] != '\0'; ++regex, ++i)
        node->charset[i] = regex[0];
    node->charset[i] = '\0';

    if (regex[0] == ']') // eat terminating ]
        ++regex;

    return node;
}

static struct Node *paren(struct Node *left)
{
    struct Node *node = parse(getprec(')'), left);
    if (regex[0] == ')')
        ++regex;

    return node;
}

static struct Node *single(void)
{
    if (regex[-1] == '.')
        return allocnode(ANY, 0, 0);
    else
        return allocnode(CHAR, regex[-1], 0);
}

static struct Node *postfix(int tok, struct Node *left)
{
    struct Node *node = allocnode((IType) tok, 0, 0);
    node->left = left;
    return node;
}

static struct Node *binary(int tok, struct Node *left)
{
    struct Node *node = allocnode((IType) tok, 0, 0);
    node->left = left;
    node->right = parse(getprec(tok) + 1, NULL);
    return node;
}

static struct Node *parse(int prec, struct Node *left)
{
    int tok = next_token();

    // handle prefix
    switch (tok) {
        case '[': left = charset(); break;
        case '(': left = paren(NULL); break;
        case '*': case '+': case '?': break;
        case '|': break;
        case '\0': return left;
        default: left = single(); break;
    }

    // handle infix and postfix operators
    while (*regex && getprec(regex[0]) < prec) {
        tok = next_token();
        switch (tok) {
            case '*': case '+': case '?': left = postfix(tok, left); break;
            case '|': left = binary('|', left); break;

            // concat is like an invisible infix operator (that's why we -- here)
            default: --regex; left = binary(CONCAT, left); break;
        }
    }

    return left;
}

static IType print_nfa(struct Node *root, IType final)
{
    IType temp;

    if (root == NULL)
        return 0;

    switch (root->type) {
        case CONCAT: // build two nfa's that link left-->right
            return print_nfa(root->left, print_nfa(root->right, final));

        case '|':
            // build two nfa's that both link to final: left-->final and right-->final
            // create two epsilon transitions to the two nfa's
            printf("(%d, ε, %d)\n", root->state, print_nfa(root->left, final));
            printf("(%d, ε, %d)\n", root->state, print_nfa(root->right, final));
            return root->state;

            case '*':
            case '+':
            case '?':
                // build an nfa that links to a new state (root): left-->root (or final in case of ?)
                temp = print_nfa(root->left, (IType) (root->type=='?'?final:root->state));
                // create two epsilon transitions one to the nfa above and one to final
                printf("(%d, ε, %d)\n", root->state, temp);
                printf("(%d, ε, %d)\n", root->state, final);
                if (root->type == '+')
                    return temp; // return the start state of the nfa as the start state into this
                else
                    return root->state; // return root as the start state into this

            case CHAR: // create a transition from this state (root) to final
                printf("(%d, %d, %d)\n", root->state, root->ch, final);
                return root->state;

            case ANY:
                printf("(%d, ANY, %d)\n", root->state, final);
                return root->state;

            case SET:
                printf("(%d, [%s], %d)\n", root->state, root->charset, final);
                return root->state;

            default:   break;
    }
    return 0;
}

int main(int argc, char *argv[])
{
    struct Node *root;
    IType       final, start;

    if (argc < 2) {
        printf("usage: %s regex\n", argv[0]);
        return 0;
    }

    regex = argv[1];

    root = parse(MAX_PREC, NULL);
    start = print_nfa(root, final = state++);
    printf("start = %d, final = %d\n", start, final);
    return 0;
}

该方法结合了两种算法：Vaughan Pratt的自顶向下运算符优先级解析方案（如Nystrom的书“Crafting Interpreters”第17章中所述）和Thompson NFA算法（如鼓舞人心的文章“Regular Expression Matching Can Be Simple And Fast”中所述）（https://swtch.com/~rsc/regexp/regexp1.html）。由于我没有在一个程序中找到这两种方法的组合，我决定分享我想出来的，即使最初的问题是5年前的。

regex 使用运算符优先级解析器解析正则表达式？

2条答案

相关问题

热门标签

最新问答