PCRE2_CODE_UNIT_WIDTH(8以外)

0sgqnhkj  于 2023-04-05  发布在  其他
关注(0)|答案(1)|浏览(284)

我有以下pcre2导入,我将其用作占位符/默认值:

/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). */

#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>

在什么情况下代码单元宽度不是8?例如,对于非ASCII的UTF-8字符,代码单元宽度应该设置为16还是8?非8位宽度最常见的用途是什么?

kmpatx3s

kmpatx3s1#

以防其他人在这个老问题上寻找答案。
问题不在于你的模式和/或主题是否包含Unicode字符,而在于你使用的底层数据类型。你使用的是常规的C字符串(8位的char指针),还是类似wchar_t或char16_t的类型?无论哪种,都必须使用与之匹配的PCRE2代码单元宽度。
下面的例子说明了这一点。char* 和 char16_t* 都可以很好地处理相同的模式和包含Unicode的主题,但是如果使用了错误的PCRE2代码单元宽度,最坏情况下会导致段错误,最好情况下也会得到错误的结果(后者或许更糟,因为您可能根本注意不到)。
(这段代码故意省略了一些检查以及您可能需要做的其他事情,所以请不要把它当作如何正确使用PCRE2的示例。)

// Build (links both the 8-bit and the 16-bit PCRE2 libraries):
// gcc -Wall pcrecuw.c -o pcrecuw -lpcre2-8 -lpcre2-16

// This program uses two code unit widths at once, so the width macro is set
// to 0 rather than 8, 16 or 32, and the code below calls the width-suffixed
// names (pcre2_compile_8, pcre2_compile_16, ...) instead of the generic ones.
// The macro must be defined before pcre2.h is included.
#define PCRE2_CODE_UNIT_WIDTH 0

#include <stdio.h>
#include <uchar.h>   // char16_t
#include <pcre2.h>

// Demonstrates that the PCRE2 code unit width must match the width of the
// underlying string type, not the character repertoire of the text.
//
// The same pattern and three subjects are built once as 8-bit (UTF-8) strings
// and once as 16-bit (UTF-16) strings. Each is then run through the matching
// PCRE2 library (correct) and through the mismatched one (deliberately wrong).
// One "Match!" / "No match!" line is printed per subject and combination.
//
// Returns 0 after running all four combinations, or 1 if one of the *correct*
// patterns fails to compile or its match data cannot be allocated.
int main(void) {
    enum { SUBJECT_COUNT = 3 };

    int status = 1;
    int errornumber = 0;
    PCRE2_SIZE erroroffset = 0;

    pcre2_code_8 *re8 = NULL, *re8x = NULL;
    pcre2_code_16 *re16 = NULL, *re16x = NULL;
    pcre2_match_data_8 *match_data8 = NULL;
    pcre2_match_data_16 *match_data16 = NULL;

    // Two versions (8- and 16-bit) of the same pattern and subjects.
    // The pattern matches the first two subjects but not the third.
    const char *pattern8 = "l(o|ö)";
    const char *subject8[SUBJECT_COUNT] = {"hello", "hellö", "hellô"};

    const char16_t *pattern16 = u"l(o|ö)";
    const char16_t *subject16[SUBJECT_COUNT] = {u"hello", u"hellö", u"hellô"};

    // Lengths in *code units*, without the final \0. The non-ASCII letters
    // take two code units in the 8-bit strings and one in the 16-bit strings.
    const PCRE2_SIZE lenp8 = 7;
    const PCRE2_SIZE lenp16 = 6;
    const PCRE2_SIZE lens8[SUBJECT_COUNT] = {5, 6, 6};
    const PCRE2_SIZE lens16[SUBJECT_COUNT] = {5, 5, 5};

    // Convert these to PCRE2 string pointers in four different ways:

    // PCRE2 8-bit with 8-bit strings (Correct)
    PCRE2_SPTR8 r_pattern8 = (PCRE2_SPTR8)pattern8;
    PCRE2_SPTR8 r_subject8[SUBJECT_COUNT] = {
        (PCRE2_SPTR8)subject8[0],
        (PCRE2_SPTR8)subject8[1],
        (PCRE2_SPTR8)subject8[2]
    };

    // PCRE2 16-bit with 16-bit strings (Correct)
    PCRE2_SPTR16 r_pattern16 = (PCRE2_SPTR16)pattern16;
    PCRE2_SPTR16 r_subject16[SUBJECT_COUNT] = {
        (PCRE2_SPTR16)subject16[0],
        (PCRE2_SPTR16)subject16[1],
        (PCRE2_SPTR16)subject16[2]
    };

    // PCRE2 8-bit with 16-bit strings (Wrong, on purpose)
    PCRE2_SPTR8 r_pattern8x = (PCRE2_SPTR8)pattern16;
    PCRE2_SPTR8 r_subject8x[SUBJECT_COUNT] = {
        (PCRE2_SPTR8)subject16[0],
        (PCRE2_SPTR8)subject16[1],
        (PCRE2_SPTR8)subject16[2]
    };

    // PCRE2 16-bit with 8-bit strings (Wrong, on purpose)
    PCRE2_SPTR16 r_pattern16x = (PCRE2_SPTR16)pattern8;
    PCRE2_SPTR16 r_subject16x[SUBJECT_COUNT] = {
        (PCRE2_SPTR16)subject8[0],
        (PCRE2_SPTR16)subject8[1],
        (PCRE2_SPTR16)subject8[2]
    };

    // Compile the correct variants. These must succeed: their compiled code
    // is handed to pcre2_match_data_create_from_pattern_*() below, which
    // cannot be called with a failed (NULL) pattern.
    re8 = pcre2_compile_8(
        r_pattern8, lenp8, PCRE2_UTF, &errornumber, &erroroffset, NULL
    );
    if (re8 == NULL) {
        fprintf(stderr, "8-bit pattern failed to compile: error %d at offset %zu\n",
                errornumber, (size_t)erroroffset);
        goto cleanup;
    }

    re16 = pcre2_compile_16(
        r_pattern16, lenp16, PCRE2_UTF, &errornumber, &erroroffset, NULL
    );
    if (re16 == NULL) {
        fprintf(stderr, "16-bit pattern failed to compile: error %d at offset %zu\n",
                errornumber, (size_t)erroroffset);
        goto cleanup;
    }

    // Compile the mismatched variants. The true lengths of the underlying
    // strings are passed as the second argument; it doesn't actually matter,
    // since we get wrong results (or errors) anyway. The results are
    // intentionally not checked: either pointer may be NULL from here on.
    re8x = pcre2_compile_8(
        r_pattern8x, lenp16, PCRE2_UTF, &errornumber, &erroroffset, NULL
    );
    re16x = pcre2_compile_16(
        r_pattern16x, lenp8, PCRE2_UTF, &errornumber, &erroroffset, NULL
    );

    // Match data is created only from the correct patterns. Creating it from
    // re8x / re16x segfaults, so the mismatched runs below reuse these blocks:
    // the results are wrong, but at least the code runs.
    match_data8 = pcre2_match_data_create_from_pattern_8(re8, NULL);
    match_data16 = pcre2_match_data_create_from_pattern_16(re16, NULL);
    if (match_data8 == NULL || match_data16 == NULL) {
        fprintf(stderr, "could not allocate PCRE2 match data\n");
        goto cleanup;
    }

    // Matching 8 with 8 (Correct)
    for (int i = 0; i < SUBJECT_COUNT; i++) {
        int rc = pcre2_match_8(
            re8, r_subject8[i], lens8[i], 0, 0, match_data8, NULL
        );
        printf("Subject #%d: %s\n", i, ((rc >= 0) ? "Match!" : "No match!"));
    }
    printf("\n");

    // Matching 16 with 16 (Correct)
    for (int i = 0; i < SUBJECT_COUNT; i++) {
        int rc = pcre2_match_16(
            re16, r_subject16[i], lens16[i], 0, 0, match_data16, NULL
        );
        printf("Subject #%d: %s\n", i, ((rc >= 0) ? "Match!" : "No match!"));
    }
    printf("\n");

    // Matching 8 with 16 (Wrong). Any negative return code, including the
    // error PCRE2 documents for a NULL compiled pattern, prints "No match!".
    for (int i = 0; i < SUBJECT_COUNT; i++) {
        int rc = pcre2_match_8(
            re8x, r_subject8x[i], lens16[i], 0, 0, match_data8, NULL
        );
        printf("Subject #%d: %s\n", i, ((rc >= 0) ? "Match!" : "No match!"));
    }
    printf("\n");

    // Matching 16 with 8 (Wrong). The 8-bit lengths are counted in bytes but
    // interpreted here as 16-bit units, so the reads run past the literals.
    for (int i = 0; i < SUBJECT_COUNT; i++) {
        int rc = pcre2_match_16(
            re16x, r_subject16x[i], lens8[i], 0, 0, match_data16, NULL
        );
        printf("Subject #%d: %s\n", i, ((rc >= 0) ? "Match!" : "No match!"));
    }

    status = 0;

cleanup:
    // The PCRE2 free functions are documented to accept NULL, so this path is
    // safe no matter how far initialization got.
    pcre2_match_data_free_8(match_data8);
    pcre2_match_data_free_16(match_data16);

    pcre2_code_free_8(re8);
    pcre2_code_free_16(re16);
    pcre2_code_free_8(re8x);
    pcre2_code_free_16(re16x);
    return status;
}

相关问题