linux 字符串使用的字符单元格数

lsmd5eda 于 2023-03-01 发布在 Linux

关注(0)|答案(6)|浏览(163)

我有一个使用UTF-8字符串输出文本表的程序，我需要测量字符串使用的等宽字符单元格的数量，以便正确对齐，如果可能的话，我希望使用标准函数来完成这项工作。

linux

来源：https://stackoverflow.com/questions/5117393/number-of-character-cells-used-by-string

6条答案

按热度按时间

7nbnzgx91#

来自Unix/Linux的UTF-8和Unicode常见问题解答：
在C语言中，可以使用mbstowcs(NULL,s,0)以可移植的方式计算字符数。只要选择了适当的语言环境，这对UTF-8和其他受支持的编码同样有效。计算UTF-8字符串中字符数的硬编码技巧是：统计除0x80–0xBF范围之外的所有字节，因为该范围内的字节只是后续（continuation）字节，本身并不是独立的字符。然而，在应用程序中，真正需要统计字符数的场合却出奇地少。

赞(0）回复(0）举报 2023-03-01

11dmarpk2#

您的系统可能有也可能没有兼容UTF-8的strlen(3)函数，不过有一些现成的简单C函数（simple C functions readily available）可以快速完成这项工作。
高效的C解决方案检查字符的开头以跳过连续字节。

/*
 * Count the characters in a NUL-terminated UTF-8 string.
 *
 * Every byte that is NOT a continuation byte (bit pattern 10xxxxxx) starts a
 * new character, so counting those bytes gives the character count.
 * The input is not validated; malformed sequences are counted byte-wise by
 * the same rule.
 */
int my_strlen_utf8_c(char *s) {
   int count = 0;

   for (int k = 0; s[k] != '\0'; k++) {
      /* Top two bits equal to 10 mark a continuation byte. */
      int is_continuation = ((s[k] & 0xc0) == 0x80);

      if (!is_continuation) {
         count++;
      }
   }

   return count;
}

更快的版本使用相同的技术，但是预取数据和进行多字节比较，结果是一个实质性的加速，但是代码更长更复杂。

赞(0）回复(0）举报 2023-03-01

9wbgstp73#

我很震惊没有人提到这一点，所以在这里记录：
如果你想在终端中对齐文本，你需要使用POSIX函数wcwidth和wcswidth。下面是正确的程序来查找字符串在屏幕上的长度。

#define _XOPEN_SOURCE
#include <wchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>

/*
 * Return the number of terminal columns needed to display a multibyte string.
 *
 * The string is decoded according to the current LC_CTYPE locale, so the
 * caller must have called setlocale() beforehand.
 *
 * Returns:
 *   >= 0  display width in columns
 *   -1    memory allocation failed, or wcswidth() reported a non-printable
 *         wide character
 *   -2    the string is not a valid multibyte sequence in the current locale
 */
int measure(char *string) {
    // First pass: count the wide characters. mbstowcs() reports an invalid
    // multibyte sequence as (size_t)-1; this must be checked before adding 1,
    // otherwise the size silently wraps around to 0.
    size_t len = mbstowcs(NULL, string, 0);
    if (len == (size_t)-1) return -2;

    // Room for the terminating L'\0'; guard the multiplication below.
    size_t needed = len + 1;
    if (needed > (size_t)-1 / sizeof(wchar_t)) return -1;

    wchar_t *wcstring = malloc(needed * sizeof *wcstring);
    if (!wcstring) return -1;

    int width;
    if (mbstowcs(wcstring, string, needed) == (size_t)-1) {
        // Conversion failed; fall through so the buffer is still released.
        width = -2;
    } else {
        // Measure the width of the converted characters.
        width = wcswidth(wcstring, len);
    }

    free(wcstring);
    return width;
}

/*
 * Print "<argument>: <width>" for every command-line argument, where the
 * width is whatever measure() returns for that argument.
 */
int main(int argc, char **argv) {
    // Adopt the locale configured in the environment.
    setlocale(LC_ALL, "");

    int idx = 1;
    while (idx < argc) {
        char *arg = argv[idx];
        printf("%s: %d\n", arg, measure(arg));
        idx++;
    }
}

下面是它运行的一个示例：

$ ./measure hello 莊子 cＡb
hello: 5
莊子: 4
cＡb: 4

请注意，两个字符的“莊子”和三个字符的“cＡb”（注意其中的全角Ａ）都占4列宽。
正如utf8everywhere.org所说，
字符串在屏幕上显示的大小与字符串中的代码点数无关。为此，必须与渲染引擎进行通信。即使在等宽字体和终端中，一个代码点也不一定恰好占用一列。POSIX考虑到了这一点。
Windows没有任何用于控制台输出的内置wcwidth函数；如果你想在Windows控制台中支持多列字符，你需要找到wcwidth的可移植实现，或者干脆放弃，因为如果不借助一些疯狂的变通手段（hack），Windows控制台并不支持Unicode。

赞(0）回复(0）举报 2023-03-01

cbeh67ev4#

如果您能够使用第三方库，请查看IBM的ICU library。

赞(0）回复(0）举报 2023-03-01

z5btuh9x5#

下面的代码考虑了格式错误的字节序列。字符串数据的示例来自Unicode标准6.3中的“Table 3-8. Use of U+FFFD in UTF-8 Conversion”。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

/*
 * True when c lies in 0x80..0xBF, the range of UTF-8 continuation (trail)
 * bytes. The argument is parenthesized so that expressions containing
 * low-precedence operators (e.g. `a | b`) are tested as a whole.
 * Note: c is evaluated twice, so do not pass an expression with side effects.
 */
#define is_trail(c) ((c) > 0x7F && (c) < 0xC0)

/* Return codes of utf8_get_next_char(). */
#define SUCCESS 1
#define FAILURE (-1)

/*
 * Decode one unit (a well-formed UTF-8 sequence or a malformed run) starting
 * at the offset stored in the size_t cursor argument and advance that cursor.
 * Arguments, in order: byte buffer, buffer size in bytes, in/out cursor,
 * out validity flag, out code point.
 * Returns SUCCESS when a unit was consumed, FAILURE when the cursor is
 * already at or past the end of the buffer.
 */
int utf8_get_next_char(const unsigned char*, size_t, size_t*, int*, unsigned int*);
/* Count the units (valid characters plus malformed runs) in a buffer of the given size. */
int utf8_length(unsigned char*, size_t);
/* Print each unit of the buffer on its own line; malformed runs are printed as U+FFFD. */
void utf8_print_each_char(unsigned char*, size_t);

/*
 * Demo driver: runs the decoder over a sample byte string that mixes valid
 * and malformed UTF-8, prints whether utf8_length() reports 10 units, and
 * then prints each unit.
 */
int main(void)
{
    unsigned char *sample =
        (unsigned char *) "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";
    size_t sample_size = strlen((const char *) sample);
    const char *verdict;

    if (utf8_length(sample, sample_size) == 10) {
        verdict = "true";
    } else {
        verdict = "false";
    }
    puts(verdict);

    utf8_print_each_char(sample, sample_size);

    return EXIT_SUCCESS;
}

/*
 * Count the units in a UTF-8 buffer.
 *
 * Each call to utf8_get_next_char() that returns SUCCESS consumes one unit,
 * either a well-formed sequence or a malformed run, and both kinds are
 * counted.
 *
 * str       byte buffer to scan
 * str_size  number of bytes in str
 * Returns the number of units found.
 */
int utf8_length(unsigned char *str, size_t str_size)
{
    int length = 0;
    size_t next_pos = 0;          /* cursor advanced by the decoder */
    int is_valid = 0;             /* required by the decoder, not used here */
    unsigned int code_point = 0;  /* required by the decoder, not used here */

    while (
        utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS
    ) {
        ++length;
    }

    return length;
}

/*
 * Print every unit of a UTF-8 buffer on its own line.
 *
 * Well-formed sequences are printed verbatim; each malformed run is replaced
 * by U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD).
 *
 * str       byte buffer to scan
 * str_size  number of bytes in str
 */
void utf8_print_each_char(unsigned char *str, size_t str_size)
{
    size_t pos = 0;               /* start of the unit being printed */
    size_t next_pos = 0;          /* cursor advanced by the decoder */
    int is_valid = 0;
    unsigned int code_point = 0;  /* required by the decoder, not used here */

    while (
        utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS
    ) {
        if (is_valid) {
            /* Take the difference in size_t first, then narrow it for the
               "%.*s" precision. */
            printf("%.*s\n", (int) (next_pos - pos), str + pos);
        } else {
            puts("\xEF\xBF\xBD");
        }

        pos = next_pos;
    }
}

/*
 * Decode the next unit of a UTF-8 buffer.
 *
 * Starting at offset *cursor, consumes either one well-formed UTF-8 sequence
 * or one malformed run, then stores the offset of the following unit back
 * into *cursor. For a truncated or broken multi-byte sequence, only the
 * bytes that were still acceptable are consumed, so the offending byte is
 * examined again as the start of the next unit.
 *
 * str         byte buffer
 * str_size    number of bytes in str
 * cursor      in/out: offset of the unit to decode / offset of the next unit
 * is_valid    out: true for a well-formed sequence, false otherwise
 * code_point  out: decoded code point, or 0 when the unit is malformed
 *
 * Returns SUCCESS when a unit was consumed, FAILURE when *cursor is already
 * at or past str_size (in that case *cursor is left untouched, *is_valid is
 * false and *code_point is 0).
 */
int utf8_get_next_char(const unsigned char *str, size_t str_size, size_t *cursor, int *is_valid, unsigned int *code_point)
{
    size_t pos = *cursor;
    size_t rest_size;
    unsigned char c;
    unsigned char min;
    unsigned char max;

    /* Pessimistic defaults; only a fully validated sequence overrides them. */
    *code_point = 0;
    *is_valid = false;

    if (pos >= str_size) {
        return FAILURE;
    }

    /* Safe to subtract now: pos < str_size, so rest_size >= 1. */
    rest_size = str_size - pos;
    c = str[pos];

    if (c < 0x80) {
        /* Single-byte (ASCII) character. */
        *code_point = c;
        *is_valid = true;
        pos += 1;
    } else if (c < 0xC2) {
        /* Stray trail byte (80..BF) or overlong lead byte (C0, C1). */
        pos += 1;
    } else if (c < 0xE0) {
        /* Two-byte sequence: C2..DF 80..BF */
        if (rest_size < 2 || !is_trail(str[pos + 1])) {
            pos += 1;
        } else {
            *code_point = ((str[pos] & 0x1F) << 6) | (str[pos + 1] & 0x3F);
            *is_valid = true;
            pos += 2;
        }
    } else if (c < 0xF0) {
        /* Three-byte sequence. The second byte is restricted after E0
           (overlong forms) and after ED (UTF-16 surrogates). */
        min = (c == 0xE0) ? 0xA0 : 0x80;
        max = (c == 0xED) ? 0x9F : 0xBF;

        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            pos += 1;
        } else if (rest_size < 3 || !is_trail(str[pos + 2])) {
            pos += 2;
        } else {
            /* A three-byte lead carries four payload bits. */
            *code_point = ((str[pos]     & 0x0F) << 12)
                       | ((str[pos + 1] & 0x3F) <<  6)
                       |  (str[pos + 2] & 0x3F);
            *is_valid = true;
            pos += 3;
        }
    } else if (c < 0xF5) {
        /* Four-byte sequence. The second byte is restricted after F0
           (overlong forms) and after F4 (beyond U+10FFFF). */
        min = (c == 0xF0) ? 0x90 : 0x80;
        max = (c == 0xF4) ? 0x8F : 0xBF;

        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            pos += 1;
        } else if (rest_size < 3 || !is_trail(str[pos + 2])) {
            pos += 2;
        } else if (rest_size < 4 || !is_trail(str[pos + 3])) {
            pos += 3;
        } else {
            *code_point = ((str[pos]     & 0x07) << 18)
                       | ((str[pos + 1] & 0x3F) << 12)
                       | ((str[pos + 2] & 0x3F) <<  6)
                       |  (str[pos + 3] & 0x3F);
            *is_valid = true;
            pos += 4;
        }
    } else {
        /* F5..FF never appear in well-formed UTF-8. */
        pos += 1;
    }

    *cursor = pos;

    return SUCCESS;
}

当我为UTF-8编写代码时，我看到了Unicode标准6.3中的“表3-7.格式良好的UTF-8字节序列”。

Code Points    First Byte Second Byte Third Byte Fourth Byte
  U+0000 -   U+007F   00 - 7F
  U+0080 -   U+07FF   C2 - DF    80 - BF
  U+0800 -   U+0FFF   E0         A0 - BF     80 - BF
  U+1000 -   U+CFFF   E1 - EC    80 - BF     80 - BF
  U+D000 -   U+D7FF   ED         80 - 9F     80 - BF
  U+E000 -   U+FFFF   EE - EF    80 - BF     80 - BF
 U+10000 -  U+3FFFF   F0         90 - BF     80 - BF    80 - BF
 U+40000 -  U+FFFFF   F1 - F3    80 - BF     80 - BF    80 - BF
U+100000 - U+10FFFF   F4         80 - 8F     80 - BF    80 - BF

赞(0）回复(0）举报 2023-03-01

wqnecbli6#

您还可以使用glib，它能让您在处理UTF-8时更加轻松。详见glib参考文档（glib reference docs）。

赞(0）回复(0）举报 2023-03-01