将核碱基表示从ASCII转换为UCSC .2bit

ymzxtsji  于 11个月前  发布在  其他
关注(0)|答案(3)|浏览(198)

非抗生素DNA序列仅由核碱基腺嘌呤(A)、胞嘧啶(C)、鸟嘌呤(G)、胸腺嘧啶(T)组成。对于人类消费,碱基可以用相应的char表示,在charchar中:ACGTacgt。然而,当需要存储长序列时,这种表示是低效的。由于只需要存储四个符号,每个符号都可以分配一个2位代码。2 UCSC规定的常用.2bit正是这样做的,使用以下编码:T = 0b00,C = 0b01,A = 0b10,G = 0b11
下面的C代码显示了为清晰起见而编写的参考实现。各种转换以char序列表示的基因组序列的开源软件通常使用由序列中的每个char索引的256条目查找表。这也与char的内部表示隔离。然而,存储器访问在能量上是昂贵的,即使访问的是片上缓存,并且通用表查找难以SIMD化。因此,如果转换可以通过简单的整数运算来完成,这将是有利的。考虑到ASCII是主要的char编码,可以限制于此。
什么是有效的计算方法来转换作为ASCII字符的核碱基到它们的.2bit-表示?

/* Convert nucleobases A, C, G, T represented as either uppercase or lowercase
   ASCII characters into UCSC .2bit-presentation. Passing any values other than
   those corresponding to 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't' results in an
   indeterminate response.
*/
unsigned int ascii_to_2bit (unsigned int a)
{
    const unsigned int UCSC_2BIT_A = 2;
    const unsigned int UCSC_2BIT_C = 1;
    const unsigned int UCSC_2BIT_G = 3;
    const unsigned int UCSC_2BIT_T = 0;

    switch (a) {
    case 'A':
    case 'a':
        return UCSC_2BIT_A;
        break;
    case 'C':
    case 'c':
        return UCSC_2BIT_C;
        break;
    case 'G':
    case 'g':
        return UCSC_2BIT_G;
        break;
    case 'T':
    case 't':
    default:
        return UCSC_2BIT_T;
        break;
    }
}

字符串

kmynzznz

kmynzznz1#

如果我们仔细观察碱基的ASCII字符的二进制代码,就会发现第1位和第2位提供了一个唯一的两位代码:A = 0b01000001 -> 0b00C = 0b01000011 -> 0b01G = 0b01000111 -> 0b11T = 0b01010100-> 0b10。类似于ASCII字符,仅在第5位不同。不幸的是,这种简单的Map并不完全匹配.2bit编码,其中A和T的代码被交换。解决这个问题的一种方法是用一个简单的四项排列表存储在一个变量中,可能在优化后分配给一个寄存器(“寄存器内查找表”):

unsigned int ascii_to_2bit_perm (unsigned int a)
{
    unsigned int perm = (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6);
    return (perm >> (a & 6)) & 3;
}

字符串
另一种方法通过观察AT的位1为0,但CG为1。因此,我们可以通过对 inverse 进行XOR来交换A和T的编码。将输入的位1与初始代码的位1进行比较:

unsigned int ascii_to_2bit_twiddle (unsigned int a)
{
    return ((a >> 1) & 3) ^ (~a & 2);
}


这个版本在具有快速位域提取指令的处理器和没有桶形移位器的低端处理器上很有优势,因为只需要右移一个位位置。在乱序处理器上,这种方法提供了比置换表更高的并行度。它似乎更容易适应SIMD实现,因为所有字节通道都使用相同的移位计数。
在我仔细研究相关ASCII字符的二进制编码之前,我研究了一个简单的数学计算,对较小的乘数和除数进行简单的蛮力搜索,得到:

unsigned int ascii_to_2bit_math (unsigned int a)
{
    return ((18 * (a & 31)) % 41) & 3;
}


乘法器18对于没有快速整数乘法器的处理器来说是友好的。现代编译器可以有效地处理带有编译时常数除数的模计算,并且不需要除法。即使如此,我注意到即使是最好的编译器也很难利用非常有限的输入和输出范围,所以我不得不手动修改它来简化代码:

unsigned int ascii_to_2bit_math (unsigned int a)
{
    unsigned int t = 18 * (a & 31);
    return (t - ((t * 25) >> 10)) & 3;
}


然而,在64位平台上,这整个计算可以用64位、32条目的查找表来代替,如果这个64位表可以有效地放入寄存器中,则该查找表可以给予有竞争力的性能,这是x86-64的情况,其中它被作为立即数加载。

unsigned int ascii_to_2bit_tab (unsigned int a)
{
    uint64_t tab = ((0ULL <<  0) | (2ULL <<  2) | (0ULL <<  4) | (1ULL <<  6) |
                    (3ULL <<  8) | (0ULL << 10) | (2ULL << 12) | (3ULL << 14) |
                    (1ULL << 16) | (3ULL << 18) | (0ULL << 20) | (2ULL << 22) |
                    (3ULL << 24) | (1ULL << 26) | (2ULL << 28) | (0ULL << 30) |
                    (1ULL << 32) | (3ULL << 34) | (1ULL << 36) | (2ULL << 38) |
                    (0ULL << 40) | (1ULL << 42) | (3ULL << 44) | (0ULL << 46) |
                    (2ULL << 48) | (0ULL << 50) | (1ULL << 52) | (3ULL << 54) |
                    (0ULL << 56) | (2ULL << 58) | (3ULL << 60) | (1ULL << 62));
    return (tab >> (2 * (a & 31))) & 3;
}


我添加了我的测试框架以供参考:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define ORIGINAL_MATH (1)

unsigned int ascii_to_2bit_perm (unsigned int a)
{
    unsigned int perm = (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6);
    return (perm >> (a & 6)) & 3;
}

unsigned int ascii_to_2bit_twiddle (unsigned int a)
{
    return ((a >> 1) & 3) ^ (~a & 2);
}

unsigned int ascii_to_2bit_math (unsigned int a)
{
#if ORIGINAL_MATH
    return ((18 * (a & 31)) % 41) & 3;
#else // ORIGINAL_MATH
    unsigned int t = 18 * (a & 31);
    return (t - ((t * 25) >> 10)) & 3;
#endif // ORIGINAL_MATH
}

unsigned int ascii_to_2bit_tab (unsigned int a)
{
    uint64_t tab = ((0ULL <<  0) | (2ULL <<  2) | (0ULL <<  4) | (1ULL <<  6) |
                    (3ULL <<  8) | (0ULL << 10) | (2ULL << 12) | (3ULL << 14) |
                    (1ULL << 16) | (3ULL << 18) | (0ULL << 20) | (2ULL << 22) |
                    (3ULL << 24) | (1ULL << 26) | (2ULL << 28) | (0ULL << 30) |
                    (1ULL << 32) | (3ULL << 34) | (1ULL << 36) | (2ULL << 38) |
                    (0ULL << 40) | (1ULL << 42) | (3ULL << 44) | (0ULL << 46) |
                    (2ULL << 48) | (0ULL << 50) | (1ULL << 52) | (3ULL << 54) |
                    (0ULL << 56) | (2ULL << 58) | (3ULL << 60) | (1ULL << 62));
    return (tab >> (2 * (a & 31))) & 3;
}

/* Convert nucleobases A, C, G, T represented as either uppercase or lowercase
   ASCII characters into UCSC .2bit-presentation. Passing any values other than
   those corresponding to 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't' results in an
   inderminate response.
*/
unsigned int ascii_to_2bit (unsigned int a)
{
    const unsigned int UCSC_2BIT_A = 2;
    const unsigned int UCSC_2BIT_C = 1;
    const unsigned int UCSC_2BIT_G = 3;
    const unsigned int UCSC_2BIT_T = 0;

    switch (a) {
    case 'A':
    case 'a':
        return UCSC_2BIT_A;
        break;
    case 'C':
    case 'c':
        return UCSC_2BIT_C;
        break;
    case 'G':
    case 'g':
        return UCSC_2BIT_G;
        break;
    case 'T':
    case 't':
    default:
        return UCSC_2BIT_T;
        break;
    }
}

int main (void)
{
    char nucleobase[8] = {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'};
    printf ("Testing permutation variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_perm (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing bit-twiddling variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_twiddle (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing math-based variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_math (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing table-based variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_tab (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    return EXIT_SUCCESS;
}

zbq4xfa0

zbq4xfa02#

一种选择是:

unsigned int ascii_to_2bit (unsigned int a)
{
    return ((0xad - a) & 6) >> 1;
}

字符串
其优点在于,它只需要8位,不会溢出,并且不包含非常数移位,因此即使没有特定的SIMD指令,例如,在64位字中输入8个字符,也可以立即用于并行化

unsigned int ascii_to_2bit_8bytes (uint64_t a)
{
    return ((0xadadadadadadadad - a) & 0x0606060606060606) >> 1;
}


在每个字节的底部留下两个输出位。

wz8daaqr

wz8daaqr3#

下面的例程将用ASCII字符串的内容填充uint32_t值的数组,该ASCII字符串由您指定的字符填充,并将保存状态,以便能够附加第二个,第三个,使用它的方法通过一个main()例程来说明,该例程从命令行获取字符串参数,并将它们传递给TOTAL元素。例程的参数在它上面的注解中描述。

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define UCSC_2BIT_T  (0)
#define UCSC_2BIT_C  (1)
#define UCSC_2BIT_A  (2)
#define UCSC_2BIT_G  (3)

#define PAIRS_PER_INTEGER   16

/**
 * Converts a string of nucleobases in ASCII form into an array
 * of 32bit integers in 2bitform.  As it should be possible to
 * continue the array of integers, a status reference is
 * maintained in order to determine determine where in the
 * integer to start putting the two bit sequences.  For this,
 * a (*state) variable is maintained (initialize it to 0) to
 * remember where to start putting bitpairs in the array.
 *
 * @param state_ref reference to the state variable to be maintained
 *               with the position on which to put the base in the
 *               array entry.
 * @param dna    string with the ASCII chain of bases.
 * @param twobit array reference to start filling.
 * @param sz     size of the array ***in array entries***.
 * @return the number of array elements written.  This allows to
 *         use a pointer and advance it at each function call
 *         with the number of entries consumed on each call.
 */
ssize_t
dna2twobit(
    int *state_ref,
    char *dna,
    uint32_t twobit[],
    size_t sz)
{
    /* local copy so we only change the state in case of
     * successful execution */
    int state = 30 - *state_ref;
    if (state == 30) *twobit = 0;
    ssize_t total_nb = 0; /* total number of array elements consumed */
    while (*dna && sz) {
        char base = toupper(*dna++);
        uint32_t tb;
        switch (base) {
        case 'A': tb = UCSC_2BIT_A; break;
        case 'C': tb = UCSC_2BIT_C; break;
        case 'T': tb = UCSC_2BIT_T; break;
        case 'G': tb = UCSC_2BIT_G; break;
        default: return -1;
        }
        *twobit |= tb << state;
        state -= 2;
        if (state < 0) {
            --sz; ++twobit;
            state += 32;
            *twobit = 0;
            total_nb++;
        }
    }
    *state_ref = 30 - state;
    return total_nb;
}

字符串
这个函数可以单独链接到任何程序中,但我提供了一个main()代码来说明函数的使用。您可以在命令行参数中使用ASCII编码的实际链来调用程序。程序将一个接一个地对它们进行编码,以演示多重转换(16个碱基适合一个32位整数,正如您在问题中发布的格式定义所述,因此,如果没有编码16的倍数,则状态反映了最后一个中覆盖了多少位。
代码继续执行下面的main函数:

#define TOTAL 16

int main(int argc, char **argv)
{
    int state = 0, i;
    uint32_t twobit_array[TOTAL], *p = twobit_array;
    size_t twobit_cap = TOTAL;

    for (i = 1; i < argc; ++i) {
        printf("Adding the following chain: [%s]\n", argv[i]);
        ssize_t n = dna2twobit(
                        &state,
                        argv[i],
                        p,
                        twobit_cap);
        if (n < 0) {
            fprintf(stderr,
                    "argument invalid: %s\n"
                    "continuing to next\n",
                    argv[i]);
            continue;
        }
        twobit_cap -= n;
        p += n;
    }
    if (!state) --p;
    uint32_t *q = twobit_array;
    size_t off = 0;
    for (int j = 0; q <= p; j++) {
        char *sep = "";
        printf("%09zd: ", off);
        for (int k = 0; k < 4 && q <= p; k++) {
            printf("%s%08x", sep, *q);
            sep = "-";
            q++;
            off += 16;
        }
        printf("\n");
    }
    return 0;
}

相关问题