assembly 为什么(或者为什么不)将 Neon 内建(intrinsic)数据类型作为函数的输入/输出参数传递?

0yycz8jy  于 2022-11-13  发布在  其他
关注(0)|答案(1)|浏览(129)

这是我做的一个小测试。这里有两个场景:
1.**方案1:**两个函数(scenario1a 和 scenario1b),其输入和输出均为 uint16_t*;在函数内部先把数据加载到 Neon 数据类型(uint16x8x4_t),计算完成后再从 Neon 数据类型(uint16x8x4_t)存储回内存。
1.**方案2:**与方案1功能相同的函数(在本例中为 scenario2a 和 scenario2b),但输入和输出均为 uint16x8x4_t*,加载和存储在 main 函数中完成。
(下面的c代码包括了用-O3编译后生成的反汇编)。

#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>

void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);

void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);

/*
 * Scenario 1a: load 32 uint16_t values from X as four 8-lane vectors,
 * transform them, and store 32 values to resultArray.
 *
 * For each vector k (0..3), the output vector is lanes 2..7 of input
 * vector k followed by lanes 0..1 of the negated input vector (k + 1) mod 4.
 * Negation is done as a multiply by -1, which wraps modulo 2^16 for
 * unsigned 16-bit lanes.
 */
void scenario1a(uint16_t* resultArray, const uint16_t* X) {
    const uint16x8x4_t src = vld1q_u16_x4(X);
    uint16x8x4_t dst;

    for (int lane = 0; lane < 4; ++lane) {
        /* The successor of the last vector wraps around to vector 0. */
        const uint16x8_t negated_next = vmulq_n_u16(src.val[(lane + 1) % 4], -1);
        dst.val[lane] = vextq_u16(src.val[lane], negated_next, 2);
    }

    vst1q_u16_x4(resultArray, dst);
}

/*
 * Scenario 1b: load 32 uint16_t values from X as four 8-lane vectors and
 * store 32 values to resultArray:
 *   out[0] = in[0] + in[1]
 *   out[1] = in[1] * -2
 *   out[2] = in[2] + in[3]
 *   out[3] = in[3] * -2
 * All arithmetic is lane-wise and wraps modulo 2^16.
 */
void scenario1b(uint16_t* resultArray, const uint16_t* X) {
    const uint16x8x4_t src = vld1q_u16_x4(X);

    const uint16x8_t sum01   = vaddq_u16(src.val[0], src.val[1]);
    const uint16x8_t scaled1 = vmulq_n_u16(src.val[1], -2);
    const uint16x8_t sum23   = vaddq_u16(src.val[2], src.val[3]);
    const uint16x8_t scaled3 = vmulq_n_u16(src.val[3], -2);

    const uint16x8x4_t dst = { { sum01, scaled1, sum23, scaled3 } };
    vst1q_u16_x4(resultArray, dst);
}

/*
 * Scenario 2a: same transform as scenario1a, but operating directly on
 * caller-provided Neon aggregates instead of loading/storing uint16_t arrays.
 *
 * For each k, result->val[k] is lanes 2..7 of temp->val[k] followed by
 * lanes 0..1 of the negated temp->val[(k + 1) mod 4]. Negation is written as
 * a multiply by -1, which wraps modulo 2^16 for unsigned 16-bit lanes.
 *
 * NOTE: result and temp are not restrict-qualified, so they may alias. Each
 * statement stores through result before the next one reads through temp;
 * in particular the last statement reads temp->val[0] after result->val[0]
 * has already been written. If both pointers refer to the same object, that
 * read sees the new value. The -O3 disassembly in this post shows the
 * consequence: temp->val[0] is loaded a second time after the first store.
 * Callers should pass distinct objects.
 */
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {

    result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
    result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
    result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
    /* Wrap-around: reads temp->val[0] after result->val[0] was stored above. */
    result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}

/*
 * Scenario 2b: same transform as scenario1b, but operating directly on
 * caller-provided Neon aggregates. The four vectors are handled as two
 * (even, odd) pairs:
 *   result[even] = temp[even] + temp[odd]
 *   result[odd]  = temp[odd] * -2
 * All arithmetic is lane-wise and wraps modulo 2^16. Reads and writes are
 * issued in ascending element order, one element at a time.
 */
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {
    const uint16x8_t* in = temp->val;
    uint16x8_t* out = result->val;

    for (int pair = 0; pair < 4; pair += 2) {
        out[pair]     = vaddq_u16(in[pair], in[pair + 1]);
        out[pair + 1] = vmulq_n_u16(in[pair + 1], -2);
    }
}

/*
 * Driver: runs the same two-stage pipeline through both calling styles.
 * The results are not printed or compared; the program always returns 0.
 */
int main(void) {
    /* 32 input samples, i.e. four 8-lane uint16 vectors, shared by both scenarios. */
    uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};

    /* Scenario 1: each stage takes uint16_t* and does its own load/store. */
    uint16_t stage1_out[32];
    uint16_t scenario1_out[32];

    scenario1a(stage1_out, input);
    scenario1b(scenario1_out, stage1_out);

    /* Scenario 2: load once here, pass Neon aggregates by pointer, store once here. */
    uint16x8x4_t loaded = vld1q_u16_x4(input);
    uint16x8x4_t stage2a_out, stage2b_out;

    scenario2a(&stage2a_out, &loaded);
    scenario2b(&stage2b_out, &stage2a_out);

    uint16_t scenario2_out[32];
    vst1q_u16_x4(scenario2_out, stage2b_out);

    return 0;
}

反汇编:

test:     file format elf64-littleaarch64

Disassembly of section .init:

0000000000000658 <_init>:
 658:   a9bf7bfd    stp x29, x30, [sp, #-16]!
 65c:   910003fd    mov x29, sp
 660:   94000065    bl  7f4 <call_weak_fn>
 664:   a8c17bfd    ldp x29, x30, [sp], #16
 668:   d65f03c0    ret

Disassembly of section .plt:

0000000000000670 <.plt>:
 670:   a9bf7bf0    stp x16, x30, [sp, #-16]!
 674:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 678:   f947c611    ldr x17, [x16, #3976]
 67c:   913e2210    add x16, x16, #0xf88
 680:   d61f0220    br  x17
 684:   d503201f    nop
 688:   d503201f    nop
 68c:   d503201f    nop

0000000000000690 <__cxa_finalize@plt>:
 690:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 694:   f947ca11    ldr x17, [x16, #3984]
 698:   913e4210    add x16, x16, #0xf90
 69c:   d61f0220    br  x17

00000000000006a0 <__libc_start_main@plt>:
 6a0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6a4:   f947ce11    ldr x17, [x16, #3992]
 6a8:   913e6210    add x16, x16, #0xf98
 6ac:   d61f0220    br  x17

00000000000006b0 <__stack_chk_fail@plt>:
 6b0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6b4:   f947d211    ldr x17, [x16, #4000]
 6b8:   913e8210    add x16, x16, #0xfa0
 6bc:   d61f0220    br  x17

00000000000006c0 <__gmon_start__@plt>:
 6c0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6c4:   f947d611    ldr x17, [x16, #4008]
 6c8:   913ea210    add x16, x16, #0xfa8
 6cc:   d61f0220    br  x17

00000000000006d0 <abort@plt>:
 6d0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6d4:   f947da11    ldr x17, [x16, #4016]
 6d8:   913ec210    add x16, x16, #0xfb0
 6dc:   d61f0220    br  x17

Disassembly of section .text:

00000000000006e0 <main>:
 6e0:   90000085    adrp    x5, 10000 <__FRAME_END__+0xf3d8>
 6e4:   a9a67bfd    stp x29, x30, [sp, #-416]!
 6e8:   910003fd    mov x29, sp
 6ec:   90000002    adrp    x2, 0 <_init-0x658>
 6f0:   91292042    add x2, x2, #0xa48
 6f4:   910263e3    add x3, sp, #0x98
 6f8:   910363e0    add x0, sp, #0xd8
 6fc:   6f008434    mvni    v20.8h, #0x1
 700:   f947f0a5    ldr x5, [x5, #4064]
 704:   aa0303e1    mov x1, x3
 708:   910143e4    add x4, sp, #0x50
 70c:   a940344c    ldp x12, x13, [x2]
 710:   a9412c4a    ldp x10, x11, [x2, #16]
 714:   f94000a6    ldr x6, [x5]
 718:   f900cfe6    str x6, [sp, #408]
 71c:   d2800006    mov x6, #0x0                    // #0
 720:   a9422448    ldp x8, x9, [x2, #32]
 724:   a9431c46    ldp x6, x7, [x2, #48]
 728:   910463e2    add x2, sp, #0x118
 72c:   a909b7ec    stp x12, x13, [sp, #152]
 730:   a90aafea    stp x10, x11, [sp, #168]
 734:   a90ba7e8    stp x8, x9, [sp, #184]
 738:   a90c9fe6    stp x6, x7, [sp, #200]
 73c:   94000069    bl  8e0 <scenario1a>
 740:   4c402400    ld1 {v0.8h-v3.8h}, [x0]
 744:   910043e1    add x1, sp, #0x10
 748:   aa0403e0    mov x0, x4
 74c:   4c402470    ld1 {v16.8h-v19.8h}, [x3]
 750:   4e619e85    mul v5.8h, v20.8h, v1.8h
 754:   4e608424    add v4.8h, v1.8h, v0.8h
 758:   4e628466    add v6.8h, v3.8h, v2.8h
 75c:   4e639e87    mul v7.8h, v20.8h, v3.8h
 760:   4c002030    st1 {v16.16b-v19.16b}, [x1]
 764:   4c002444    st1 {v4.8h-v7.8h}, [x2]
 768:   94000072    bl  930 <scenario2a>
 76c:   ad409885    ldp q5, q6, [x4, #16]
 770:   90000081    adrp    x1, 10000 <__FRAME_END__+0xf3d8>
 774:   910563e2    add x2, sp, #0x158
 778:   3dc00c84    ldr q4, [x4, #48]
 77c:   3dc017e7    ldr q7, [sp, #80]
 780:   f947f021    ldr x1, [x1, #4064]
 784:   4e749c83    mul v3.8h, v4.8h, v20.8h
 788:   4e668482    add v2.8h, v4.8h, v6.8h
 78c:   4e749ca1    mul v1.8h, v5.8h, v20.8h
 790:   4e6784a0    add v0.8h, v5.8h, v7.8h
 794:   4c002440    st1 {v0.8h-v3.8h}, [x2]
 798:   f940cfe0    ldr x0, [sp, #408]
 79c:   f9400022    ldr x2, [x1]
 7a0:   eb020000    subs    x0, x0, x2
 7a4:   d2800002    mov x2, #0x0                    // #0
 7a8:   54000081    b.ne    7b8 <main+0xd8>  // b.any
 7ac:   52800000    mov w0, #0x0                    // #0
 7b0:   a8da7bfd    ldp x29, x30, [sp], #416
 7b4:   d65f03c0    ret
 7b8:   97ffffbe    bl  6b0 <__stack_chk_fail@plt>

00000000000007bc <_start>:
 7bc:   d280001d    mov x29, #0x0                       // #0
 7c0:   d280001e    mov x30, #0x0                       // #0
 7c4:   aa0003e5    mov x5, x0
 7c8:   f94003e1    ldr x1, [sp]
 7cc:   910023e2    add x2, sp, #0x8
 7d0:   910003e6    mov x6, sp
 7d4:   90000080    adrp    x0, 10000 <__FRAME_END__+0xf3d8>
 7d8:   f947f800    ldr x0, [x0, #4080]
 7dc:   90000083    adrp    x3, 10000 <__FRAME_END__+0xf3d8>
 7e0:   f947f463    ldr x3, [x3, #4072]
 7e4:   90000084    adrp    x4, 10000 <__FRAME_END__+0xf3d8>
 7e8:   f947e084    ldr x4, [x4, #4032]
 7ec:   97ffffad    bl  6a0 <__libc_start_main@plt>
 7f0:   97ffffb8    bl  6d0 <abort@plt>

00000000000007f4 <call_weak_fn>:
 7f4:   90000080    adrp    x0, 10000 <__FRAME_END__+0xf3d8>
 7f8:   f947ec00    ldr x0, [x0, #4056]
 7fc:   b4000040    cbz x0, 804 <call_weak_fn+0x10>
 800:   17ffffb0    b   6c0 <__gmon_start__@plt>
 804:   d65f03c0    ret
 808:   d503201f    nop
 80c:   d503201f    nop

0000000000000810 <deregister_tm_clones>:
 810:   b0000080    adrp    x0, 11000 <__data_start>
 814:   91004000    add x0, x0, #0x10
 818:   b0000081    adrp    x1, 11000 <__data_start>
 81c:   91004021    add x1, x1, #0x10
 820:   eb00003f    cmp x1, x0
 824:   540000c0    b.eq    83c <deregister_tm_clones+0x2c>  // b.none
 828:   90000081    adrp    x1, 10000 <__FRAME_END__+0xf3d8>
 82c:   f947e421    ldr x1, [x1, #4040]
 830:   b4000061    cbz x1, 83c <deregister_tm_clones+0x2c>
 834:   aa0103f0    mov x16, x1
 838:   d61f0200    br  x16
 83c:   d65f03c0    ret

0000000000000840 <register_tm_clones>:
 840:   b0000080    adrp    x0, 11000 <__data_start>
 844:   91004000    add x0, x0, #0x10
 848:   b0000081    adrp    x1, 11000 <__data_start>
 84c:   91004021    add x1, x1, #0x10
 850:   cb000021    sub x1, x1, x0
 854:   d37ffc22    lsr x2, x1, #63
 858:   8b810c41    add x1, x2, x1, asr #3
 85c:   9341fc21    asr x1, x1, #1
 860:   b40000c1    cbz x1, 878 <register_tm_clones+0x38>
 864:   90000082    adrp    x2, 10000 <__FRAME_END__+0xf3d8>
 868:   f947fc42    ldr x2, [x2, #4088]
 86c:   b4000062    cbz x2, 878 <register_tm_clones+0x38>
 870:   aa0203f0    mov x16, x2
 874:   d61f0200    br  x16
 878:   d65f03c0    ret
 87c:   d503201f    nop

0000000000000880 <__do_global_dtors_aux>:
 880:   a9be7bfd    stp x29, x30, [sp, #-32]!
 884:   910003fd    mov x29, sp
 888:   f9000bf3    str x19, [sp, #16]
 88c:   b0000093    adrp    x19, 11000 <__data_start>
 890:   39404260    ldrb    w0, [x19, #16]
 894:   35000140    cbnz    w0, 8bc <__do_global_dtors_aux+0x3c>
 898:   90000080    adrp    x0, 10000 <__FRAME_END__+0xf3d8>
 89c:   f947e800    ldr x0, [x0, #4048]
 8a0:   b4000080    cbz x0, 8b0 <__do_global_dtors_aux+0x30>
 8a4:   b0000080    adrp    x0, 11000 <__data_start>
 8a8:   f9400400    ldr x0, [x0, #8]
 8ac:   97ffff79    bl  690 <__cxa_finalize@plt>
 8b0:   97ffffd8    bl  810 <deregister_tm_clones>
 8b4:   52800020    mov w0, #0x1                    // #1
 8b8:   39004260    strb    w0, [x19, #16]
 8bc:   f9400bf3    ldr x19, [sp, #16]
 8c0:   a8c27bfd    ldp x29, x30, [sp], #32
 8c4:   d65f03c0    ret
 8c8:   d503201f    nop
 8cc:   d503201f    nop

00000000000008d0 <frame_dummy>:
 8d0:   17ffffdc    b   840 <register_tm_clones>
 8d4:   d503201f    nop
 8d8:   d503201f    nop
 8dc:   d503201f    nop

00000000000008e0 <scenario1a>:
 8e0:   4c402420    ld1 {v0.8h-v3.8h}, [x1]
 8e4:   6e60b833    neg v19.8h, v1.8h
 8e8:   6e60b852    neg v18.8h, v2.8h
 8ec:   6e60b871    neg v17.8h, v3.8h
 8f0:   6e60b810    neg v16.8h, v0.8h
 8f4:   6e132004    ext v4.16b, v0.16b, v19.16b, #4
 8f8:   6e122025    ext v5.16b, v1.16b, v18.16b, #4
 8fc:   6e112046    ext v6.16b, v2.16b, v17.16b, #4
 900:   6e102067    ext v7.16b, v3.16b, v16.16b, #4
 904:   4c002404    st1 {v4.8h-v7.8h}, [x0]
 908:   d65f03c0    ret
 90c:   d503201f    nop

0000000000000910 <scenario1b>:
 910:   4c402420    ld1 {v0.8h-v3.8h}, [x1]
 914:   6f008430    mvni    v16.8h, #0x1
 918:   4e619e05    mul v5.8h, v16.8h, v1.8h
 91c:   4e608424    add v4.8h, v1.8h, v0.8h
 920:   4e628466    add v6.8h, v3.8h, v2.8h
 924:   4e639e07    mul v7.8h, v16.8h, v3.8h
 928:   4c002404    st1 {v4.8h-v7.8h}, [x0]
 92c:   d65f03c0    ret

0000000000000930 <scenario2a>:
 930:   ad400025    ldp q5, q0, [x1]
 934:   ad408423    ldp q3, q1, [x1, #16]
 938:   3dc00c24    ldr q4, [x1, #48]
 93c:   6e60b800    neg v0.8h, v0.8h
 940:   4ea11c22    mov v2.16b, v1.16b
 944:   6e60b821    neg v1.8h, v1.8h
 948:   6e0020a5    ext v5.16b, v5.16b, v0.16b, #4
 94c:   4ea41c80    mov v0.16b, v4.16b
 950:   6e60b884    neg v4.8h, v4.8h
 954:   6e012063    ext v3.16b, v3.16b, v1.16b, #4
 958:   3d800005    str q5, [x0]
 95c:   3dc00021    ldr q1, [x1]
 960:   6e042042    ext v2.16b, v2.16b, v4.16b, #4
 964:   ad008803    stp q3, q2, [x0, #16]
 968:   6e60b821    neg v1.8h, v1.8h
 96c:   6e012000    ext v0.16b, v0.16b, v1.16b, #4
 970:   3d800c00    str q0, [x0, #48]
 974:   d65f03c0    ret
 978:   d503201f    nop
 97c:   d503201f    nop

0000000000000980 <scenario2b>:
 980:   ad401022    ldp q2, q4, [x1]
 984:   6f008420    mvni    v0.8h, #0x1
 988:   ad410c21    ldp q1, q3, [x1, #32]
 98c:   4e609c85    mul v5.8h, v4.8h, v0.8h
 990:   4e648442    add v2.8h, v2.8h, v4.8h
 994:   4e609c60    mul v0.8h, v3.8h, v0.8h
 998:   4e638421    add v1.8h, v1.8h, v3.8h
 99c:   ad001402    stp q2, q5, [x0]
 9a0:   ad010001    stp q1, q0, [x0, #32]
 9a4:   d65f03c0    ret

00000000000009a8 <__libc_csu_init>:
 9a8:   a9bc7bfd    stp x29, x30, [sp, #-64]!
 9ac:   910003fd    mov x29, sp
 9b0:   a90153f3    stp x19, x20, [sp, #16]
 9b4:   90000094    adrp    x20, 10000 <__FRAME_END__+0xf3d8>
 9b8:   9135c294    add x20, x20, #0xd70
 9bc:   a9025bf5    stp x21, x22, [sp, #32]
 9c0:   90000095    adrp    x21, 10000 <__FRAME_END__+0xf3d8>
 9c4:   9135a2b5    add x21, x21, #0xd68
 9c8:   cb150294    sub x20, x20, x21
 9cc:   2a0003f6    mov w22, w0
 9d0:   a90363f7    stp x23, x24, [sp, #48]
 9d4:   aa0103f7    mov x23, x1
 9d8:   aa0203f8    mov x24, x2
 9dc:   97ffff1f    bl  658 <_init>
 9e0:   eb940fff    cmp xzr, x20, asr #3
 9e4:   54000160    b.eq    a10 <__libc_csu_init+0x68>  // b.none
 9e8:   9343fe94    asr x20, x20, #3
 9ec:   d2800013    mov x19, #0x0                       // #0
 9f0:   f8737aa3    ldr x3, [x21, x19, lsl #3]
 9f4:   aa1803e2    mov x2, x24
 9f8:   91000673    add x19, x19, #0x1
 9fc:   aa1703e1    mov x1, x23
 a00:   2a1603e0    mov w0, w22
 a04:   d63f0060    blr x3
 a08:   eb13029f    cmp x20, x19
 a0c:   54ffff21    b.ne    9f0 <__libc_csu_init+0x48>  // b.any
 a10:   a94153f3    ldp x19, x20, [sp, #16]
 a14:   a9425bf5    ldp x21, x22, [sp, #32]
 a18:   a94363f7    ldp x23, x24, [sp, #48]
 a1c:   a8c47bfd    ldp x29, x30, [sp], #64
 a20:   d65f03c0    ret
 a24:   d503201f    nop

0000000000000a28 <__libc_csu_fini>:
 a28:   d65f03c0    ret

Disassembly of section .fini:

0000000000000a2c <_fini>:
 a2c:   a9bf7bfd    stp x29, x30, [sp, #-16]!
 a30:   910003fd    mov x29, sp
 a34:   a8c17bfd    ldp x29, x30, [sp], #16
 a38:   d65f03c0    ret

三个问题:

1.通常,人们从指针加载数据(使用 vld1q_u16_x4),用 Neon 数据类型进行运算,然后存储到另一个指针(使用 vst1q_u16_x4),而不是采用我在方案2中使用的方法(把 Neon 数据类型作为输入/输出参数传递)。这是为什么?
1.我对比了 scenario1a(从地址 8e0 开始)与 scenario2a(从地址 930 开始)的反汇编。看起来 scenario2a 的数据移动更多。所有情况下都会这样吗?那么,问题1中提到的那种做法(方案1)是否更快?如果是,为什么 scenario1b 与 scenario2b(分别位于地址 910 和 980)之间没有出现这种差异?
1.在 main 函数中,调用 scenario1a 和 scenario2a 之后都有一些 add/mul 指令(位于地址 750、754、758、75c 以及 784、788、78c、790),但是我的 main 函数里并没有乘法和加法。为什么会出现这种情况?(我只是好奇)
谢谢你的帮助!

fwzugrvs

fwzugrvs1#

1.绝对没有理由使用指向 Neon 数据类型的指针作为参数。内存并不关心数据类型。编译器非常保守和官僚,它们只能照章办事。这就像向政府部门提交申请:勾错一个选项,你的申请就会落到错误的人手里,造成巨大而不必要的麻烦。
简而言之:保持简单。不要试图以任何方式给编译器或代码评审者留下深刻印象。
1.上次我告诉过你,要把内存的加载和存储显式地写出来。在方案2中,你是直接从内存取数、直接向内存写结果来进行计算的。永远不要这样做。坚持"加载 → 计算 → 存储"。局部变量是你最好的朋友。(__restrict 限定符可能会有帮助)
同样,不要试图给编译器或评审员留下深刻印象。你的场景2只是在自找麻烦。一场彻头彻尾的灾难。评审员会立即升起一个危险信号,并密切关注你和你所有的代码,如果你幸运的话,没有被立即解雇。
1.你不应该把被调用者和调用者放在同一个文件中。通常情况下,编译器会把较短的非静态被调用者内联到调用者中,这会使分析变得更困难。

相关问题