#!/usr/bin/env perl
use warnings;
use strict;
use feature qw/state/;
use open qw/:std :locale/;
use charnames qw/:full/;
use Unicode::UCD qw/charinfo charprop/;
# Return the number of fixed-width columns taken up by a unicode codepoint
# Inspired by https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
# First adapted to use C++/ICU functions and then to perl
sub charwidth ($) {
state %cache;
my $cp = shift; # Numeric codepoint
return $cache{$cp} if exists $cache{$cp};
if ($cp == 0 || $cp == 0x200B) {
# nul and ZERO WIDTH SPACE
$cache{$cp} = 0;
return 0;
} elsif ($cp >= 0x1160 && $cp <= 0x11FF) {
# Hangul Jamo vowels and final consonants
$cache{$cp} = 0;
return 0;
} elsif ($cp == 0xAD) {
# SOFT HYPHEN
$cache{$cp} = 1;
return 1;
}
my $ci = charinfo($cp);
return undef unless defined $ci;
my $type = $ci->{category};
if ($type eq "Cc" || $type eq "Mn" || $type eq "Me" || $type eq "Cf") {
# Control Code, Non Spacing Mark, Enclosing Mark, Format Char
$cache{$cp} = 0;
return 0;
}
state $widths = { Fullwidth => 2, Wide => 2, Halfwidth => 1, Narrow => 1,
Neutral => 1, Ambiguous => 1 };
my $eaw = charprop($cp, "East_Asian_Width");
my $width = $widths->{$eaw} // 1;
$cache{$cp} = $width;
return $width;
}
sub testwidth ($) {
my $char = shift;
my $cp = ord $char;
printf "Width of %c (U+%04X %s) is %d\n", $cp, $cp, charnames::viacode($cp),
charwidth($cp);
}
testwidth "\x04";
testwidth "a";
testwidth "\N{MEDIUM BLACK CIRCLE}";
testwidth "\N{LARGE RED CIRCLE}";
testwidth "\N{U+20A9}";
testwidth "\N{U+1F637}";
使用示例:
$ ./charwidths.pl
Width of (U+0004 END OF TRANSMISSION) is 0
Width of a (U+0061 LATIN SMALL LETTER A) is 1
Width of ⚫ (U+26AB MEDIUM BLACK CIRCLE) is 2
Width of 🔴 (U+1F534 LARGE RED CIRCLE) is 2
Width of ₩ (U+20A9 WON SIGN) is 1
Width of 😷 (U+1F637 FACE WITH MEDICAL MASK) is 2
use Unicode::UCD;
use open qw/:std :locale/;
use charnames qw/:full/;
use feature 'state';
use List::Util 'reduce';
sub map_im {
my ($im, $h) = @_;
die unless $im->[2] eq 's';
my $out;
for my $i (0 .. $#{ $im->[0] }) {
my $val = $h->{ $im->[1][$i] };
my $different = @{ $out->[0] } ? ($val ne $out->[1][-1]) : defined($val);
if ($different) {
push @{ $out->[0] }, $im->[0][$i];
push @{ $out->[1] }, $val;
}
}
return $out;
}
sub merge_im {
my ($l, $r) = @_;
die unless $l->[0][0] == 0;
my $out;
my $idx_l = my $idx_r = 0;
my $val_l = $l->[1][0];
my $val_r;
while ($idx_r < @{ $r->[0] } || $idx_l < @{ $l->[0] }) {
my $newcp;
# Take from the list with the lower next entry. Or the one with entries left.
# This could probably be simplified.
if ($idx_r >= @{ $r->[0] } || ($idx_l < @{ $l->[0] }
&& $l->[0][$idx_l] <= $r->[0][$idx_r])) {
$newcp = $l->[0][$idx_l];
$val_l = $l->[1][$idx_l];
$idx_l ++;
} else {
$newcp = $r->[0][$idx_r];
$val_r = $r->[1][$idx_r];
$idx_r ++;
}
# But if they both have a transition at the same codepoint, take both so there's
# not a duplicate.
if ($idx_r < @{ $r->[0] } && $r->[0][$idx_r] == $newcp) {
$val_r = $r->[1][$idx_r];
$idx_r ++;
}
my $newval = defined($val_r) ? $val_r : $val_l;
# This gets skipped if we updated $val_l but $val_r is overriding, or
# $val_r went from undef to equaling $val_l.
if ($newval ne $out->[1][-1]) {
push @{ $out->[0] }, $newcp;
push @{ $out->[1] }, $newval;
}
}
return $out;
}
sub charwidth {
state $width_eaw = map_im([Unicode::UCD::prop_invmap('East_Asian_Width')],
{ F => 2, W => 2, H => 1, Na => 1, Neutral => 1, A => 1 }
);
state $width_cat = map_im([Unicode::UCD::prop_invmap('Category')],
{ Cc => 0, Mn => 0, Me => 0, Cf => 0 }
);
state $width_override = [
[ 0x0000, 0x0001, # NUL
0x00AD, 0x00AE, # Soft Hyphen
0x1160, 0x1200, # Hangul Jamo vowels and final consonants
0x200B, 0x200C, # ZWSP
],
[ 0, undef,
1, undef,
0, undef,
0, undef,
],
];
state $merged = reduce { merge_im($a, $b) } $width_eaw, $width_cat, $width_override;
my $cp = shift;
my $idx = Unicode::UCD::search_invlist($merged->[0], $cp);
return $merged->[1][$idx];
}
sub testwidth($) {
my $char = shift;
my $cp = ord $char;
printf "Width of %c (U+%04X %s) is %d\n", $cp, $cp,
charnames::viacode($cp), charwidth($cp);
}
testwidth "\x04";
testwidth "a";
testwidth "\N{MEDIUM BLACK CIRCLE}";
testwidth "\N{LARGE RED CIRCLE}";
testwidth "\N{U+20A9}";
testwidth "\N{U+1F637}";
2条答案
按热度按时间aiazj4mn1#
我有一些C++代码躺在周围计算字符宽度。所以,稍后快速转换到Perl,然后...
使用示例:
它只是对代码点的特定范围和类别进行一些特殊情况检查,然后使用东亚宽度属性以及TR11的建议来确定其他所有内容的宽度。
1rhkuytd2#
这是一个有点混乱,我不太确定是否要把它放到互联网上,而不把它清理到一个合适的库中...但是我不太可能 * 有时间 * 去做那个库,所以在这里它是有用的。它在很大程度上是从Shawn's contribution派生出来的,但是它没有使用每码点的“缓存”,而不是使用可能增长到数百万个条目的“缓存”,它使用Unicode::UCD数据来构建码点范围的“invmap”以及它们在第一次调用时的相关宽度;查询该Map的工作方式类似于(并且成本与单个
charprop
调用相同或略低)。map_im
接受prop_invmap
返回的invmap,并通过散列Map属性值。任何在散列中找不到的值都将成为undef
,它不被Unicode::UCD使用,但被我们的代码视为“无关”。merge_im
接受两个这样的invmap并合并它们,这样“右边”invmap中的值覆盖“左边”invmap中的值,但右边的undef范围允许左边的值“闪耀”。charwidth
的状态初始化Map并合并三个invmap(East_Asian_Width、Category和一个特殊情况覆盖列表的invmap),根据Shawn自己的charwidth
的逻辑,该函数使用Unicode::UCD自己的search_invlist
例程简单地查询。在我的笔记本电脑上,初始化需要〈60 ms,并生成一个909元素的invmap(使用来自perl5.32.1的UCD),之后每次调用查询需要大约2.5us。