寻找绝对最快的方式将整数作为单个数字写入C文件-包括微优化

guz6ccqo  于 2023-01-08  发布在  其他
关注(0)|答案(3)|浏览(90)

我正在用C语言编写一个程序,它的主要目标是绝对速度--这是一个代码性能竞赛。有更多的方法可以加速程序,但是,最大的加速潜力是在I/O操作中,特别是保存到文本文件中。该文件的结构如下:每行3个任意位数的整数,用空格分隔。整数是预先知道的,它们只需要转换成字符串并写入输出缓冲区。
整数的范围仅为-1INT_MAX
缓冲区大小根据写入的数据而变化(我设置了它),但大多数情况下,写入的文件大小为数百兆字节到超过1千兆字节,缓冲区大小在4到8 MB之间。

int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
    const size_t w_bufsize = get_bufsize(param);
    void *buf = NULL;
    posix_memalign(&buf, sysconf(_SC_PAGE_SIZE), w_bufsize)
    posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
    size_t num_written = 0;
    size_t f_idx = 0;
    for (int i = 0; i < num_ints; ++i) {
        myStruct *str = main_struct->structs + i;
        f_idx = fast_write_3_ints(buf, str->int1, str->int2, str->int3, f_idx);
        if (f_idx + BYTES_PER_ROW > w_bufsize) {
            write(fd, buf, f_idx) != f_idx
            if (num_written)
                posix_fadvise(fd, (num_written - 1) * w_bufsize, w_bufsize,
                              POSIX_FADV_DONTNEED);
            f_idx = 0;
            ++num_written;
        }

(返回值检查和frees/closes为便于阅读而缩写)
为了将整数转换为文本,我使用以下方法:https://kenny-peng.com/2021/05/28/printing_integers_fast.html我通过绕过临时缓冲区并将字符直接存储到输出缓冲区来进一步改进它(在我的机器上性能提高了10- 15%)。

size_t fast_write_3_ints(char *out_buf, int num1, int num2, int num3,
                         size_t idx)
{
    char *temp_ptr = NULL;
    int n_digits = 0;
    if (num1 < 0) {
        out_buf[idx++] = '-';
        num1 = -num1;
    }
    if (num1 < 10) {
        out_buf[idx++] = num1 + '0';
    } else {
        idx += count_digits(num1);
        temp_ptr = out_buf + idx;
        for (; num1 >= 1000; num1 /= 1000) {
            temp_ptr -= 3;
            lookup_digits(temp_ptr, num1 % 1000, 3);
        }
        if (num1) {
            num1 %= 1000;
            n_digits = count_digits(num1);
            lookup_digits(temp_ptr - n_digits, num1, n_digits);
        }
    }
    out_buf[idx++] = ' ';
    // write int 2 and 3 - abbreviated
    out_buf[idx++] = '\n';
    return idx;
}

static void lookup_digits(char *arr, int num, char write_size)
{
    static const char table[3000] __attribute__((aligned(64))) =
        "000001002003004005006007008009"
        "010011012013014015016017018019"
        "020021022023024025026027028029"
        "030031032033034035036037038039"
        "040041042043044045046047048049"
        "050051052053054055056057058059"
        "060061062063064065066067068069"
        "070071072073074075076077078079"
        "080081082083084085086087088089"
        "090091092093094095096097098099"
        "100101102103104105106107108109"
        "110111112113114115116117118119"
        "120121122123124125126127128129"
        "130131132133134135136137138139"
        "140141142143144145146147148149"
        "150151152153154155156157158159"
        "160161162163164165166167168169"
        "170171172173174175176177178179"
        "180181182183184185186187188189"
        "190191192193194195196197198199"
        "200201202203204205206207208209"
        "210211212213214215216217218219"
        "220221222223224225226227228229"
        "230231232233234235236237238239"
        "240241242243244245246247248249"
        "250251252253254255256257258259"
        "260261262263264265266267268269"
        "270271272273274275276277278279"
        "280281282283284285286287288289"
        "290291292293294295296297298299"
        "300301302303304305306307308309"
        "310311312313314315316317318319"
        "320321322323324325326327328329"
        "330331332333334335336337338339"
        "340341342343344345346347348349"
        "350351352353354355356357358359"
        "360361362363364365366367368369"
        "370371372373374375376377378379"
        "380381382383384385386387388389"
        "390391392393394395396397398399"
        "400401402403404405406407408409"
        "410411412413414415416417418419"
        "420421422423424425426427428429"
        "430431432433434435436437438439"
        "440441442443444445446447448449"
        "450451452453454455456457458459"
        "460461462463464465466467468469"
        "470471472473474475476477478479"
        "480481482483484485486487488489"
        "490491492493494495496497498499"
        "500501502503504505506507508509"
        "510511512513514515516517518519"
        "520521522523524525526527528529"
        "530531532533534535536537538539"
        "540541542543544545546547548549"
        "550551552553554555556557558559"
        "560561562563564565566567568569"
        "570571572573574575576577578579"
        "580581582583584585586587588589"
        "590591592593594595596597598599"
        "600601602603604605606607608609"
        "610611612613614615616617618619"
        "620621622623624625626627628629"
        "630631632633634635636637638639"
        "640641642643644645646647648649"
        "650651652653654655656657658659"
        "660661662663664665666667668669"
        "670671672673674675676677678679"
        "680681682683684685686687688689"
        "690691692693694695696697698699"
        "700701702703704705706707708709"
        "710711712713714715716717718719"
        "720721722723724725726727728729"
        "730731732733734735736737738739"
        "740741742743744745746747748749"
        "750751752753754755756757758759"
        "760761762763764765766767768769"
        "770771772773774775776777778779"
        "780781782783784785786787788789"
        "790791792793794795796797798799"
        "800801802803804805806807808809"
        "810811812813814815816817818819"
        "820821822823824825826827828829"
        "830831832833834835836837838839"
        "840841842843844845846847848849"
        "850851852853854855856857858859"
        "860861862863864865866867868869"
        "870871872873874875876877878879"
        "880881882883884885886887888889"
        "890891892893894895896897898899"
        "900901902903904905906907908909"
        "910911912913914915916917918919"
        "920921922923924925926927928929"
        "930931932933934935936937938939"
        "940941942943944945946947948949"
        "950951952953954955956957958959"
        "960961962963964965966967968969"
        "970971972973974975976977978979"
        "980981982983984985986987988989"
        "990991992993994995996997998999";
        
    memcpy(arr, table + 3 * num + 3 - write_size, write_size);
}

static int count_digits(int num)
{
    if (num < 100000)
        if (num < 1000)
            if (num < 100)
                if (num < 10)
                    return 1;
                else
                    return 2;
            else
                return 3;
        else if (num < 10000)
            return 4;
        else
            return 5;
    else if (num < 10000000)
        if (num < 1000000)
            return 6;
        else
            return 7;
    else if (num < 100000000)
        return 8;
    else if (num < 1000000000)
        return 9;
    else
        return 10;
}

这是目前主要的生产代码。下面我将描述我尝试了哪些替代方案以及结果如何。
我还必须指出,我的电脑是一台14英寸的MacBook Pro,配有M1 Pro芯片和非常快的SSD,这使得IO操作与主计算相比完全可以忽略不计。(很可能),在那里,保存文件是目前为止最慢的一点。2我还注意到一些改变使它在我的机器上表现得更好,但在实际的求值器上表现得更差(可能取决于高速缓存大小/内存速度)。
我还尝试实现了如下所述的无查找的整数到字符串处理:https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/在我的机器上,这并没有提高性能,提高的幅度超过运行间差异。
我还尝试将表扩展到4*10000数字,但它只将我的机器上的性能提高了3- 5%,实际上使它在评估系统中变得更糟(可能会大大降低CPU/内存)。
还有什么我可以优化的吗?我已经没有主意了。历史上最快的代码版本保存到文件的速度比我的实现快18%。
一个线程解决了某个问题,但使用了不同的函数,这些函数(在我看来)速度较慢,执行的操作数更多?The fastest way to save graph to file in C或者我应该尝试将单个大型缓冲区例程集成到我的算法中,并改为写入st_blksize大小的缓冲区?非常感谢您的帮助或建议

**EDIT:**确定输出缓冲区大小的函数(将param视为要写入的行数)

size_t get_bufsize(int param)
{
    size_t bufsize = 4096;
    if (param >= 1000 && param < 10000)
        bufsize <<= 4;
    else if (param >= 10000 && param < 100000)
        bufsize <<= 6;
    else if (param >= 100000 && param < 1000000)
        bufsize <<= 8;
    else if (param >= 1000000 && param <= 5000000)
        bufsize <<= 10;
    else if (param > 5000000)
        bufsize <<= 11;
    //  printf("Buffer size: %zu\n", bufsize);
    return bufsize;
}

EDIT 2:整数的范围仅为-1到INT_MAX。

wvyml7n5

wvyml7n51#

以下是一些尝试提高代码效率的指导:

  • 如果在遗留系统上运行,则应指定O_BINARY,以确保write系统调用不会执行某些系统特定的转换。
  • 在将缓冲区刷新到磁盘时,您应该尝试只写入整数个页面,并将剩余的块移动到缓冲区的开头。分配适当数量的4K页面加上一些松弛,然后写入4K页面是分配大量页面并发出部分写入的更好方法。
  • 您的函数fast_write_3_ints有一个冗余语句num1 %= 1000;if (num1)测试,可以进一步简化它以提高处理小值的速度:
size_t fast_write_3_ints(char *out_buf, int num1, int num2, int num3,
                         size_t idx)
{
    char *temp_ptr;
    int n_digits;

    if (num1 < 0) {
        out_buf[idx++] = '-';
        num1 = -num1;
    }
    if (num1 < 1000) {
        if (num1 < 10) {
            out_buf[idx++] = num1 + '0';
        } else {
            n_digits = 2 + (num1 >= 100);
            lookup_digits(out_buf + idx, num1, n_digits));
            idx += n_digits;
        }
    } else {
        n_digits = count_digits(num1);
        idx += n_digits;
        temp_ptr = out_buf + idx;
        while (n_digits > 3) {
            int digits = num1 % 1000;
            num1 /= 1000;  // group division and modulo
            temp_ptr -= 3;
            lookup_digits(temp_ptr, digits, 3);
            n_digits -= 3;
        }
        lookup_digits(temp_ptr - n_digits, num1, n_digits);
    }
    out_buf[idx++] = ' ';
    // write int 2 and 3 - abbreviated
    out_buf[idx++] = '\n';
    return idx;
}
  • count_digits使用无分支代码可能会提高一些速度:
static int count_digits(int num) {
    return 1 + (num > 9)   + (num > 99)       + (num > 999) +
           (num > 9999)    + (num > 99999)    + (num > 999999) +
           (num > 9999999) + (num > 99999999) + (num > 999999999);
}
ni65a41a

ni65a41a2#

intint_fast32_t的对比

考虑int_fast32_t而不是int,因为64位类型可能更快。

避免使用2个值进行区间测试

一个小小的改进可能与一个简化的if树。
此外,最好先测试较大的值,因为这样更可能匹配。

uint_fast32_t get_bufsize(int param) {
    #define  BLOCK ((uint_fast32_t) 4096)
    if (param >= 5000000) {
        return BLOCK << 11;
    }
    if (param >= 1000000) {
        return BLOCK << 10;
    }
    if (param >= 100000) {
        return BLOCK << 8;
    }
    if (param >= 10000) {
        return BLOCK) << 6;
    }
    if (param >= 1000) {
        return BLOCK << 4;
    }
    return BLOCK;
}

unsignedint的比较

我从来没有遇到过使用intunsigned更快的情况,但是使用unsigned有一些更快代码的潜力。值得一试。在if (num1 < 0)测试之后,代码可以移动到 unsigned 数学,并且可能看到边际改进。
我怀疑其中任何一个都不会有显著的改善,但可能会推动一个更快的代码。

jrcvhitl

jrcvhitl3#

如果您试图优化以避免不必要地执行代码,并且唯一的负值是-1,请更改:

if (num1 < 0) {
    out_buf[idx++] = '-';
    num1 = -num1;
}
if (num1 < 10) {
    out_buf[idx++] = num1 + '0';
} else {

if (num1 < 10) {
    if (num1 < 0) num = 1, out_buf[idx++] = '-';
    out_buf[idx++] = num1 + '0';
} else {

此外,似乎您试图在某些特殊情况下处理剩余的1、2或3位数。这是不必要的。
下面的示例代码从@chqrlie“借用”了无分支函数。它还计算两位/三位数,而不是索引到LUT中。考虑一下LUT......将前100个值切片到第二个“两位”函数中,修剪前导零,并且停止对指针和计数执行《双城之战》的计算。(我并不是建议您使用这些函数,因为运算太多了。您可以使用两个不同的转换函数......也可以不使用。)最后,这个示例只处理正数,并且只转换一个。

void lookup_2_digits( char *p, int n ) { // Use a LUT... I didn't for this example
    p[1] = (char)(n % 10 + '0'); n /= 10;
    p[0] = (char)(n + '0');
}

void lookup_3_digits( char *p, int n ) { // Use a LUT... I didn't for this example
    p[2] = (char)(n % 10 + '0'); n /= 10;
    p[1] = (char)(n % 10 + '0'); n /= 10;
    p[0] = (char)(n + '0');
}

int count_digits(int n) {
    return 1+ (n > 9) +       (n > 99)       + (n > 999)
            + (n > 9999)    + (n > 99999)    + (n > 999999)
            + (n > 9999999) + (n > 99999999) + (n > 999999999);
}

void doit( int num1 ) {
    char out_buf[512] = {0};
    int idx = 0;
    idx += count_digits( num1 );
    char *temp_ptr = out_buf + idx;
    do {
        if( num1 <= 99 ) {
            if( num1 <= 9 )
                /* Can deal with -1 here */
                *--temp_ptr = num1 + '0';
            else
                lookup_2_digits( temp_ptr-2, num1 );
            num1 = 0;
        } else {
            lookup_3_digits( temp_ptr -= 3, num1 % 1000 );
            num1 /= 1000;
        }
    } while( num1 > 0 );
    puts( out_buf );
}

int main( void ) {
    doit( 2165536 );
    return 0;
}

相关问题