在C中将字符串拆分为相等的整数

nr9pn0ug  于 2023-03-01  发布在  其他
关注(0)|答案(2)|浏览(176)

我正在尝试将一个由字符 '1' 和 '0' 组成、长度为16的字符数组拆分成两个等长的8位部分,并把每个8位二进制串转换为对应的十进制整数。
示例:char* str = "0001011011110000"; 预期结果:

int s = 00010110;
int t = 11110000;

完整代码。它的功能是:用户输入一串DNA(例如:ATTCGG)。如果字符串长度不能被4整除,就用strcat()追加填充字符。然后,把每个字符转换成两位的二进制字符串,存入新的字符数组xtr[64]。接着,必须把该数组拆分成两个等长的8位二进制串,并分别转换为十进制数,用这两个十进制数来表示这段DNA。基本上,这个作业就是对DNA做二进制压缩。

/*
 * Reads a DNA string (letters A/T/C/G) from stdin, pads it with '0'
 * characters, expands every character into two '0'/'1' digits in xtr, then
 * tries to split xtr into two halves and convert each half to an int.
 * Prints the padded input, the digit string and the two converted values.
 *
 * NOTE(review): the code is kept exactly as posted; the defects that make
 * the second value come out wrong are flagged inline.
 */
int main()
{
    char str[64];
    scanf("%s", str); // NOTE(review): no field width, so input longer than 63 characters overflows str
    int obe = strlen(str);
    int mod = obe % 4;
    if (mod != 0) {
        // Appends `mod` pad characters.
        // NOTE(review): reaching the next multiple of 4 needs 4 - mod characters;
        // the two counts only agree when mod == 2.
        for (int i = mod; i > 0; i--) {
            strcat(str, "0");
        }
    }
    int j;
    char xtr[64] = "";
    // Expand each character into two binary digits:
    // A=00, T=01, C=10, G=11, and the '0' pad character maps to 00.
    // Any other character appends nothing.
    // NOTE(review): xtr has room for 63 digits plus the terminator, so 32 or
    // more recognised characters overflow it.
    for (j = 0; j < strlen(str); j++) {
        if (str[j] == 'A') {
            strcat(xtr, "0");
            strcat(xtr, "0");
        } else if (str[j] == 'T') {
            strcat(xtr, "0");
            strcat(xtr, "1");
        } else if (str[j] == 'C') {
            strcat(xtr, "1");
            strcat(xtr, "0");
        } else if (str[j] == 'G') {
            strcat(xtr, "1");
            strcat(xtr, "1");
        } else if (str[j] == '0') {
            strcat(xtr, "0");
            strcat(xtr, "0");
        }
    }
    
    // First half of the digit string goes into ret[0..k-1].
    // NOTE(review): ret is uninitialised and no '\0' is stored after the copy,
    // so ret is not a valid C string when atoi() reads it.
    int k = strlen(xtr) / 2;
    char ret[64];
    for (int i = 0; i < k; i++) {
        ret[i] = xtr[i];
    }
    
    // Second half is meant to go into ter.
    // NOTE(review): the loop starts at k + 1, so xtr[k] is skipped, and it
    // writes to ter[i] rather than ter[i - k]; ter[0..k] is never assigned and
    // no '\0' is stored, so atoi(ter) reads indeterminate bytes.
    char ter[64];
    for (int i = k + 1; i < strlen(xtr); i++) {
        ter[i] = xtr[i];  
    }
    // NOTE(review): atoi() parses base-10 text, so even a well-formed
    // "00010110" would yield 10110 rather than 22; strtol(..., 2) or manual
    // bit shifting is needed for binary.
    int s = atoi(ret);
    int t = atoi(ter);
    printf("%s", str);
    printf("\n");
    printf("%s", xtr);
    printf("\n");
    printf("%d", s);
    printf("\n");
    printf("%d", t);
    
    
}

结果:ATTCGG00 0001011011110000 10110 0
问题:第二个整数没有被正确转换,并且这个代码非常原始。可能需要位运算符。

wfveoks0

wfveoks01#

#include <stdio.h>

/**
 * Parse the leading '0'/'1' characters of str as a binary number, most
 * significant bit first.
 *
 * At most bitCount characters are consumed. Any character other than '1'
 * counts as a 0 bit. Parsing stops at the string terminator, so a string
 * shorter than bitCount is never read past its end; the bits seen so far
 * are returned.
 *
 * @param str      NUL-terminated text of binary digits; not modified.
 * @param bitCount Maximum number of characters to consume; a value <= 0
 *                 yields 0.
 * @return The parsed value.
 */
int parseBitChars(const char *str, int bitCount) {
  // Accumulate as unsigned: left-shifting a 1 into the sign bit of an int
  // is undefined behaviour.
  unsigned int ret = 0;
  for (int i = 0; i < bitCount && str[i] != '\0'; i++) {
    ret = (ret << 1) | (str[i] == '1' ? 1u : 0u);
  }
  return (int)ret;
}

int main() {
  // Sixteen '0'/'1' characters, i.e. two bytes written out as text
  char *bits = "0001011011110000";

  // Interpret all sixteen digits as a single number
  int whole = parseBitChars(bits, 16);
  printf("Value: %d\n", whole); // Value: 5872

  // Interpret the first and the last eight digits separately
  int high = parseBitChars(bits, 8);
  int low = parseBitChars(bits + 8, 8);
  printf("Bytes: %d %d\n", high, low); // Bytes: 22 240
}
vsmadaxz

vsmadaxz2#

下面是代码的修改版本,其中包含注释:

#include <stdio.h>
#include <string.h>

/*
 * Reads up to 32 DNA bases (A/T/C/G) from stdin, pads the sequence with 'A'
 * to a multiple of 4, expands every base into two binary digits and prints
 * the digits in blocks of 8.
 * Returns 0 on success, 1 if no input could be read.
 */
int main( void ) {
    char str[ 32 + 1 ]; // Up to 32 bases (plus terminator)
    char xtr[ 64 + 1 ] = ""; // Two digits per base, so up to 64 (plus terminator)

    // Limit user entry to the buffer; stop if nothing was read so that the
    // code below never works on an uninitialised buffer
    if( scanf( "%32s", str ) != 1 )
        return 1;

    // Measure the token itself: %n would also count skipped leading whitespace
    int obe = (int)strlen( str );

    // Pad (with 'A') up to the next multiple of 4. The number of missing
    // bases is (4 - obe % 4) % 4, not obe % 4; at most 32 bases result, so
    // str cannot overflow.
    for( int i = ( 4 - obe % 4 ) % 4; i > 0; i-- )
        strcat( str, "A" );

    // Convert bases to binary values in a string; other characters are skipped
    for( int j = 0; str[ j ]; j++ )
             if ( str[j] == 'A' ) strcat( xtr, "00" );
        else if ( str[j] == 'T' ) strcat( xtr, "01" );
        else if ( str[j] == 'C' ) strcat( xtr, "10" );
        else if ( str[j] == 'G' ) strcat( xtr, "11" );

    // Output in blocks of 8 digits, each prefixed with its offset in xtr.
    for( int k = 0, len = strlen( xtr ); k < len; k += 8 )
        printf( "%d - %.8s\n", k, xtr + k );

    return 0;
}
ATTCGG
0 - 00010110
8 - 11110000

没有必要先把DNA序列转换成中间字符串。
幸运的是,字母"A"、"C"、"G"和"T"的ASCII码在第1位和第2位上各不相同,这两位可以直接当作每个碱基的编码。注意,这种"编码"与你的方案不同,它为各个碱基分配的位模式和你的不一样。

'A' = 0bxxxxx00x ==> 0 // 'x' == "don't care"
'C' = 0bxxxxx01x ==> 2
'G' = 0bxxxxx11x ==> 6
'T' = 0bxxxxx10x ==> 4

缺点是,与传统的"ACGT"顺序相比,这种编码中最后两个碱基(G和T)的顺序被交换了。
这种"交换"可以借助一个精心构造的8位十六进制常量(0xB4)做一次映射来还原。
浏览以下代码并学习下面的演示字符串:

#include <stdio.h>

/*
 * Prints the base string p, then packs it two bits per base into
 * unsigned int chunks and prints every chunk as hex followed by the bases
 * decoded back from it.
 *
 * The chunk type decides how many bases fit: sizeof(chunk) * 4 bases per
 * chunk, i.e. 16 for a 4-byte unsigned int.
 * NOTE(review): the "%X" conversion below matches unsigned int; a wider
 * chunk type would presumably need a different conversion.
 */
void demo( char *p ) {
    puts( p );

    for( const char *cur = p; *cur != '\0'; ) {
        unsigned int chunk = 0;
        const int perChunk = sizeof(chunk) * 4; // bases per chunk
        int room = perChunk;                    // free 2-bit slots left

        while( room > 0 && *cur != '\0' ) {
            // Bits 1-2 of the ASCII code differ for A, C, G and T: masking
            // with 6 gives 0, 2, 6 and 4 respectively. Shifting 0xB4
            // (0b10110100) right by that amount and keeping the low two
            // bits maps 'A'->0, 'C'->1, 'G'->2, 'T'->3.
            const unsigned code = ( 0xB4 >> ( *cur & 6 ) ) & 3;

            chunk = ( chunk << 2 ) | code;
            cur++;
            room--;
        }

        // A short final chunk is left-aligned; the unused slots stay 0b00
        chunk <<= 2 * room;

        // Playback for verification: hex value, then the decoded bases
        printf( "%0*X - ", perChunk / 2, chunk );
        for( int shift = 2 * ( perChunk - 1 ); shift >= 0; shift -= 2 )
            putchar( "ACGT"[ ( chunk >> shift ) & 3 ] );
        putchar( '\n' );
    }
}

int main( void ) {
/*
    Some bonus alternative translation functions
    char *cp;
#   define M1 "\0\1\3\2"[*cp>>1&3]
#   define M2 "\0\0\0\1\3\0\0\2"[*cp&7]
#   define M3 3&0x8340>>(*cp<<1&0xF)
#   define M4 3&0xB4>>(*cp&6)
    char *n = "0123";
    for( cp = "ACGT"; *cp; cp++ ) printf( "%c %c%c%c%c\n", *cp, n[M1], n[M2], n[M3], n[M4] );
*/
    // Sample sequences, passed to demo() in this order
    char *samples[] = {
        "TGCTTGCCTGCATGCA",  // 16 bases
        "TTGCTTGCCTGCATGCT", // 17 bases
        "T",                 // 1-4 bases
        "AT",
        "AAT",
        "AAAT",
        "CATCATCATCATCATCATCATCATCATCATCATCATCATCATCAT", // lots of bases
    };
    const int count = sizeof samples / sizeof samples[0];

    for( int s = 0; s < count; s++ )
        demo( samples[ s ] );

    return 0;
}

输出演示:

TGCTTGCCTGCATGCA
E7E5E4E4 - TGCTTGCCTGCATGCA

TTGCTTGCCTGCATGCT
F9F97939 - TTGCTTGCCTGCATGC
C0000000 - TAAAAAAAAAAAAAAA

T
C0000000 - TAAAAAAAAAAAAAAA

AT
30000000 - ATAAAAAAAAAAAAAA

AAT
0C000000 - AATAAAAAAAAAAAAA

AAAT
03000000 - AAATAAAAAAAAAAAA

CATCATCATCATCATCATCATCATCATCATCATCATCATCATCAT
4D34D34D - CATCATCATCATCATC
34D34D34 - ATCATCATCATCATCA
D34D34C0 - TCATCATCATCATAAA

可以拿这段代码多做些试验。

**编辑:**

这是核心处理的另一个版本,它同时将4个碱基的批次转换为1和0的字符串,并显示十进制等价物。

unsigned char four = 0; // collects the 2-bit codes of the current group of four bases

    // Convert bases to binary: the text digits go to xtr, the same bits go to 'four'
    static const char *const digits[] = { "00", "01", "10", "11" };
    int j = 0;
    while( str[ j ] ) {
        int code;
        switch( str[ j ] ) {
            case 'A': code = 0; break;
            case 'T': code = 1; break;
            case 'C': code = 2; break;
            case 'G': code = 3; break;
            default:  code = -1; break; // other characters contribute nothing
        }

        if( code >= 0 ) {
            strcat( xtr, digits[ code ] );
            four = (four << 2) | code;
        }

        // After every fourth character: print the digits with their decimal
        // value, then start a fresh group
        j++;
        if( j % 4 == 0 ) {
            printf( "%s - %3d\n", xtr, four );
            xtr[0] = '\0';
            four = 0;
        }
    }
ATTCGG
00010110 -  22
11110000 - 240

相关问题