assembly 理解JIT对for循环的重写

44u64gxh  于 2023-08-06  发布在  其他
关注(0)|答案(1)|浏览(74)

我有下面的Java代码(所有数组在我们调用“arrays”之前都被初始化,并且所有数组的大小都是“arraySize”)

int arraySize = 64;

float[] a;
float[] b;
float[] result;

/**
 * Fills result[i] from a[i] and b[i] for every i in [0, arraySize).
 * This is the loop whose JIT-compiled output is listed below.
 */
public void arrays() {
    for (int i = 0; i < arraySize; i++) {
        // Evaluated in source order: a*b, +b, -b, /b, then a is added four times;
        // the compiled code performs the same mul/add/sub/div/add sequence.
        result[i] = ((a[i] * b[i] + b[i] - b[i]) / b[i]) +
                     a[i] + a[i] + a[i] + a[i];
    }
}

JIT的输出为:

# {method} {0x00000001034751a8} 'arrays' '()V' in 'main/ComplexExpression'
#           [sp+0x30]  (sp of caller)
[Entry Point]
0x000000010c4c55a0: mov 0x8(%rsi),%r10d
0x000000010c4c55a4: movabs $0x800000000,%r11
0x000000010c4c55ae: add %r11,%r10
0x000000010c4c55b1: cmp %r10,%rax
0x000000010c4c55b4: jne 0x000000010c44b780  ;   {runtime_call ic_miss_stub}
0x000000010c4c55ba: xchg %ax,%ax
0x000000010c4c55bc: nopl 0x0(%rax)
[Verified Entry Point]
0x000000010c4c55c0: mov %eax,-0x14000(%rsp)
0x000000010c4c55c7: push %rbp
0x000000010c4c55c8: sub $0x20,%rsp  ;*synchronization entry
                                    ; - main.ComplexExpression::arrays@-1 (line 51)
0x000000010c4c55cc: mov %rsi,%rcx
0x000000010c4c55cf: mov 0xc(%rsi),%ebp  ;*getfield arraySize {reexecute=0 rethrow=0 return_oop=0}
                                        ; - main.ComplexExpression::arrays@4 (line 51)
0x000000010c4c55d2: test %ebp,%ebp
0x000000010c4c55d4: jle L0006  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                               ; - main.ComplexExpression::arrays@7 (line 51)
0x000000010c4c55da: mov 0x10(%rsi),%r11d  ;*getfield a {reexecute=0 rethrow=0 return_oop=0}
                                          ; - main.ComplexExpression::arrays@16 (line 52)
0x000000010c4c55de: xchg %ax,%ax
0x000000010c4c55e0: mov 0xc(%r12,%r11,8),%r10d  ; implicit exception: dispatches to 0x000000010c4c58a3
                                                ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c55e5: test %r10d,%r10d
0x000000010c4c55e8: jbe L0007
0x000000010c4c55ee: movslq %ebp,%r9
0x000000010c4c55f1: movslq %r10d,%r10
0x000000010c4c55f4: dec %r9
0x000000010c4c55f7: cmp %r10,%r9
0x000000010c4c55fa: nopw 0x0(%rax,%rax,1)
0x000000010c4c5600: jae L0007
0x000000010c4c5606: mov 0x14(%rsi),%ebx  ;*getfield b {reexecute=0 rethrow=0 return_oop=0}
                                         ; - main.ComplexExpression::arrays@22 (line 52)
0x000000010c4c5609: mov 0xc(%r12,%rbx,8),%r10d  ; implicit exception: dispatches to 0x000000010c4c58a3
                                                ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c560e: test %r10d,%r10d
0x000000010c4c5611: jbe L0007
0x000000010c4c5617: movslq %r10d,%r10
0x000000010c4c561a: nopw 0x0(%rax,%rax,1)
0x000000010c4c5620: cmp %r10,%r9
0x000000010c4c5623: jae L0007
0x000000010c4c5629: mov 0x18(%rsi),%r8d  ;*getfield result {reexecute=0 rethrow=0 return_oop=0}
                                         ; - main.ComplexExpression::arrays@11 (line 52)
0x000000010c4c562d: mov 0xc(%r12,%r8,8),%r10d  ; implicit exception: dispatches to 0x000000010c4c58a3
                                               ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                               ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5632: test %r10d,%r10d
0x000000010c4c5635: jbe L0007
0x000000010c4c563b: movslq %r10d,%r10
0x000000010c4c563e: xchg %ax,%ax
0x000000010c4c5640: cmp %r10,%r9
0x000000010c4c5643: jae L0007
0x000000010c4c5649: lea (%r12,%r8,8),%rdx
0x000000010c4c564d: lea (%r12,%r11,8),%rdi
0x000000010c4c5651: mov %edx,%r11d
0x000000010c4c5654: lea (%r12,%rbx,8),%rax
0x000000010c4c5658: shr $0x2,%r11d
0x000000010c4c565c: and $0x7,%r11d
0x000000010c4c5660: mov $0x3,%r9d
0x000000010c4c5666: sub %r11d,%r9d
0x000000010c4c5669: and $0x7,%r9d
0x000000010c4c566d: inc %r9d
0x000000010c4c5670: cmp %ebp,%r9d
0x000000010c4c5673: cmovg %ebp,%r9d
0x000000010c4c5677: xor %r10d,%r10d
0x000000010c4c567a: xor %r11d,%r11d  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                                     ; - main.ComplexExpression::arrays@10 (line 52)
             L0000: vmovss 0x10(%rax,%r11,4),%xmm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5684: vmovss 0x10(%rdi,%r11,4),%xmm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c568b: vmulss %xmm1,%xmm0,%xmm3
0x000000010c4c568f: vaddss %xmm1,%xmm3,%xmm2
0x000000010c4c5693: vsubss %xmm1,%xmm2,%xmm3
0x000000010c4c5697: vdivss %xmm1,%xmm3,%xmm1
0x000000010c4c569b: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c569f: vaddss %xmm0,%xmm2,%xmm1
0x000000010c4c56a3: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c56a7: vaddss %xmm0,%xmm2,%xmm0
0x000000010c4c56ab: vmovss %xmm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c56b2: inc %r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                               ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c56b5: cmp %r9d,%r11d
0x000000010c4c56b8: jl L0000  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                              ; - main.ComplexExpression::arrays@7 (line 51)
0x000000010c4c56ba: mov %ebp,%r9d
0x000000010c4c56bd: add $0xffffffe1,%r9d
0x000000010c4c56c1: mov $0x80000000,%r8d
0x000000010c4c56c7: cmp %r9d,%ebp
0x000000010c4c56ca: cmovl %r8d,%r9d
0x000000010c4c56ce: cmp %r9d,%r11d
0x000000010c4c56d1: jge L0004
0x000000010c4c56d7: mov $0x7d00,%ebx
             L0001: mov %r9d,%esi
0x000000010c4c56df: sub %r11d,%esi
0x000000010c4c56e2: cmp %r11d,%r9d
0x000000010c4c56e5: cmovl %r10d,%esi
0x000000010c4c56e9: cmp $0x7d00,%esi
0x000000010c4c56ef: cmova %ebx,%esi
0x000000010c4c56f2: add %r11d,%esi
0x000000010c4c56f5: data16 data16 nopw 0x0(%rax,%rax,1)  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                                                         ; - main.ComplexExpression::arrays@10 (line 52)
             L0002: vmovdqu 0x10(%rax,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5707: vmovdqu 0x10(%rdi,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c570e: vmulps %ymm0,%ymm1,%ymm2
0x000000010c4c5712: vaddps %ymm0,%ymm2,%ymm2
0x000000010c4c5716: vsubps %ymm0,%ymm2,%ymm2
0x000000010c4c571a: vdivps %ymm0,%ymm2,%ymm0
0x000000010c4c571e: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5722: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5726: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c572a: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c572e: vmovdqu %ymm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5735: vmovdqu 0x30(%rdi,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c573c: vmovdqu 0x30(%rax,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5743: vmulps %ymm1,%ymm0,%ymm2
0x000000010c4c5747: vaddps %ymm1,%ymm2,%ymm2
0x000000010c4c574b: vsubps %ymm1,%ymm2,%ymm2
0x000000010c4c574f: vdivps %ymm1,%ymm2,%ymm1
0x000000010c4c5753: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c5757: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c575b: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c575f: vaddps %ymm0,%ymm1,%ymm0
0x000000010c4c5763: vmovdqu %ymm0,0x30(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c576a: vmovdqu 0x50(%rdi,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c5771: vmovdqu 0x50(%rax,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5778: vmulps %ymm1,%ymm0,%ymm2
0x000000010c4c577c: vaddps %ymm1,%ymm2,%ymm2
0x000000010c4c5780: vsubps %ymm1,%ymm2,%ymm2
0x000000010c4c5784: vdivps %ymm1,%ymm2,%ymm1
0x000000010c4c5788: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c578c: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c5790: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c5794: vaddps %ymm0,%ymm1,%ymm0
0x000000010c4c5798: vmovdqu %ymm0,0x50(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c579f: vmovdqu 0x70(%rdi,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c57a6: vmovdqu 0x70(%rax,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c57ad: vmulps %ymm1,%ymm0,%ymm2
0x000000010c4c57b1: vaddps %ymm1,%ymm2,%ymm2
0x000000010c4c57b5: vsubps %ymm1,%ymm2,%ymm2
0x000000010c4c57b9: vdivps %ymm1,%ymm2,%ymm1
0x000000010c4c57bd: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c57c1: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c57c5: vaddps %ymm0,%ymm1,%ymm1
0x000000010c4c57c9: vaddps %ymm0,%ymm1,%ymm0
0x000000010c4c57cd: vmovdqu %ymm0,0x70(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c57d4: add $0x20,%r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                                     ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c57d8: cmp %esi,%r11d
0x000000010c4c57db: nopl 0x0(%rax,%rax,1)
0x000000010c4c57e0: jl L0002  ;*goto {reexecute=0 rethrow=0 return_oop=0}
                              ; - main.ComplexExpression::arrays@81 (line 51)
0x000000010c4c57e6: mov 0x348(%r15),%rsi  ; ImmutableOopMap {rcx=Oop rdi=Oop rdx=Oop rax=Oop }
                                          ;*goto {reexecute=1 rethrow=0 return_oop=0}
                                          ; - (reexecute) main.ComplexExpression::arrays@81 (line 51)
0x000000010c4c57ed: test %eax,(%rsi)  ;*goto {reexecute=0 rethrow=0 return_oop=0}
                                      ; - main.ComplexExpression::arrays@81 (line 51)
                                      ;   {poll} *** SAFEPOINT POLL ***
0x000000010c4c57ef: cmp %r9d,%r11d
0x000000010c4c57f2: jl L0001
0x000000010c4c57f8: mov %ebp,%r10d
0x000000010c4c57fb: add $0xfffffff9,%r10d
0x000000010c4c57ff: cmp %r10d,%ebp
0x000000010c4c5802: cmovl %r8d,%r10d
0x000000010c4c5806: cmp %r10d,%r11d
0x000000010c4c5809: jge L0004
0x000000010c4c580b: nop  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                         ; - main.ComplexExpression::arrays@10 (line 52)
             L0003: vmovdqu 0x10(%rax,%r11,4),%ymm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5813: vmovdqu 0x10(%rdi,%r11,4),%ymm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c581a: vmulps %ymm0,%ymm1,%ymm2
0x000000010c4c581e: vaddps %ymm0,%ymm2,%ymm2
0x000000010c4c5822: vsubps %ymm0,%ymm2,%ymm2
0x000000010c4c5826: vdivps %ymm0,%ymm2,%ymm0
0x000000010c4c582a: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c582e: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5832: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c5836: vaddps %ymm1,%ymm0,%ymm0
0x000000010c4c583a: vmovdqu %ymm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                     ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5841: add $0x8,%r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                                    ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c5845: cmp %r10d,%r11d
0x000000010c4c5848: jl L0003  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                              ; - main.ComplexExpression::arrays@7 (line 51)
             L0004: cmp %ebp,%r11d
0x000000010c4c584d: jge L0006
0x000000010c4c584f: nop  ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                         ; - main.ComplexExpression::arrays@10 (line 52)
             L0005: vmovss 0x10(%rax,%r11,4),%xmm1  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@26 (line 52)
0x000000010c4c5857: vmovss 0x10(%rdi,%r11,4),%xmm0  ;*faload {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@20 (line 52)
0x000000010c4c585e: vmulss %xmm1,%xmm0,%xmm3
0x000000010c4c5862: vaddss %xmm1,%xmm3,%xmm2
0x000000010c4c5866: vsubss %xmm1,%xmm2,%xmm3
0x000000010c4c586a: vdivss %xmm1,%xmm3,%xmm1
0x000000010c4c586e: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c5872: vaddss %xmm0,%xmm2,%xmm1
0x000000010c4c5876: vaddss %xmm0,%xmm1,%xmm2
0x000000010c4c587a: vaddss %xmm0,%xmm2,%xmm0
0x000000010c4c587e: vmovss %xmm0,0x10(%rdx,%r11,4)  ;*fastore {reexecute=0 rethrow=0 return_oop=0}
                                                    ; - main.ComplexExpression::arrays@77 (line 52)
0x000000010c4c5885: inc %r11d  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
                               ; - main.ComplexExpression::arrays@78 (line 51)
0x000000010c4c5888: cmp %ebp,%r11d
0x000000010c4c588b: jl L0005
             L0006: vzeroupper
0x000000010c4c5890: add $0x20,%rsp
0x000000010c4c5894: pop %rbp
0x000000010c4c5895: cmp 0x340(%r15),%rsp  ;   {poll_return} *** SAFEPOINT POLL ***
0x000000010c4c589c: ja L0008
0x000000010c4c58a2: ret  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                         ; - main.ComplexExpression::arrays@7 (line 51)
             L0007: mov $0xffffff76,%esi
0x000000010c4c58a8: mov %rcx,(%rsp)
0x000000010c4c58ac: vzeroupper
0x000000010c4c58af: call 0x000000010c451000  ; ImmutableOopMap {[0]=Oop }
                                             ;*if_icmpge {reexecute=1 rethrow=0 return_oop=0}
                                             ; - (reexecute) main.ComplexExpression::arrays@7 (line 51)
                                             ;   {runtime_call UncommonTrapBlob}
             L0008: movabs $0x10c4c5895,%r10  ;   {internal_word}
0x000000010c4c58be: mov %r10,0x358(%r15)
0x000000010c4c58c5: jmp 0x000000010c452100  ;   {runtime_call SafepointBlob}
0x000000010c4c58ca: hlt
0x000000010c4c58cb: hlt
0x000000010c4c58cc: hlt
0x000000010c4c58cd: hlt
0x000000010c4c58ce: hlt
0x000000010c4c58cf: hlt
0x000000010c4c58d0: hlt
0x000000010c4c58d1: hlt
0x000000010c4c58d2: hlt
0x000000010c4c58d3: hlt
0x000000010c4c58d4: hlt
0x000000010c4c58d5: hlt
0x000000010c4c58d6: hlt
0x000000010c4c58d7: hlt
0x000000010c4c58d8: hlt
0x000000010c4c58d9: hlt
0x000000010c4c58da: hlt
0x000000010c4c58db: hlt
0x000000010c4c58dc: hlt
0x000000010c4c58dd: hlt
0x000000010c4c58de: hlt
0x000000010c4c58df: hlt
[Exception Handler]
0x000000010c4c58e0: jmp 0x000000010c464a00  ;   {no_reloc}
[Deopt Handler Code]
0x000000010c4c58e5: call 0x000000010c4c58ea
0x000000010c4c58ea: subq $0x5,(%rsp)
0x000000010c4c58ef: jmp 0x000000010c4513a0  ;   {runtime_call DeoptimizationBlob}
0x000000010c4c58f4: hlt
0x000000010c4c58f5: hlt
0x000000010c4c58f6: hlt
0x000000010c4c58f7: hlt


看起来这个for循环被翻译成了多个“汇编循环”。我猜主要部分在L0002中,在那里可以看到循环展开;但还有L0000、L0003和L0005,它们似乎也是这个for循环的一部分,我很难理解它们是如何组合在一起的。
有人能解释一下所有标签下的各个部分是如何组成实际循环的吗?我想这是JIT中某种“已知”的模式,但我并不了解。

xwbd5t1u

xwbd5t1u1#

只是讨论跳转标签所指示的主循环。
L0000循环:对齐到8个元素的倍数(32字节)。and $0x7就是明显的标志。凡是涉及向量指令的地方,几乎都会讨论向量对齐。我没有一个能完整解释这个主题的权威参考链接,但你可以在SO和其他地方找到大量资料。Intel® 64 and IA-32 Architectures Optimization Reference Manual也应该涵盖了这一点。
即使不看这些资料,也应该不言而喻:最高效的内存访问是那些不会跨越缓存行、更不会跨越内存页的访问。与自然向量大小对齐可以确保这一点。
AVX和AVX2硬件通常可以很好地处理未对齐的访问,但较旧的SSE硬件和较新的AVX512硬件需要对齐才能获得最佳性能。我猜代码生成器只是对所有这些情况一视同仁。
因为我们有两个输入和一个输出,所以我们只能真正与其中一个对齐,然后希望其他的也同样对齐。如果我正确地读取了代码,则会在输出上执行对齐,而不是在输入上执行对齐,但我可能会将其读取错误。
L0002循环:主循环展开4次,每次迭代4 x 8 = 32个元素。
L0003循环:与主循环相同,但没有展开,用于处理剩余的0-3个完整向量。
L0005循环:将最后0-7个元素作为标量处理。有趣的是,使用16字节XMM寄存器进行一次迭代可以节省4次迭代,但编译器选择不这样做。我猜他们觉得不值得。在L0000循环中也可以执行相同的操作。
编译器知道输出与输入没有重叠。否则,您可能会期望另一个单独的循环来处理两个对象重叠得如此紧密以至于无法进行矢量化的情况。这可能只是到L0005循环的一个单独的条件跳转。

对齐循环的计数器

让我们试着弄清L0000这个对齐循环。它的循环计数器是%r11d(在循环开始前刚被置为0),循环上限是%r9d。让我们回溯%r9d的来源:

0x000000010c4c5670: cmp %ebp,%r9d
0x000000010c4c5673: cmovg %ebp,%r9d

我们之前已经确定%ebp保存着arraySize,所以这基本上就是r9d = min(r9d, arraySize)。这说得通:对于未对齐的小数组,L0000循环会处理全部内容。
在此之前:

0x000000010c4c5649: lea (%r12,%r8,8),%rdx
0x000000010c4c5651: mov %edx,%r11d
0x000000010c4c5658: shr $0x2,%r11d
0x000000010c4c565c: and $0x7,%r11d
0x000000010c4c5660: mov $0x3,%r9d
0x000000010c4c5666: sub %r11d,%r9d
0x000000010c4c5669: and $0x7,%r9d
0x000000010c4c566d: inc %r9d


r12在这段代码中从未被初始化。有一个相关问题表明它保存的是堆基址。再往上,可以找到0x000000010c4c5629: mov 0x18(%rsi),%r8d ;*getfield result这一行。因此我们假设lea (%r12,%r8,8),%rdx给出的是result数组的起始地址。
但是,该地址似乎不是数据内容的开始。在循环的后面,您会发现所有内存访问都遵循vmovss %xmm0,0x10(%rdx,%r11,4)模式。请注意附加的偏移0x10 = 16。我们可以猜测,数组的前16个字节包含诸如大小之类的元数据。此偏移量会合并到每个后续计算中。我不知道
shr $0x2相当于除以4,把字节偏移转换为float索引。and $0x7给出相对于上一个8的倍数的偏移。用C代码表示就是:

/* Guessed layout of the float[] object: the disassembly addresses every element
 * at base + 0x10, so 16 bytes of metadata are assumed to precede the data.
 * meta1/meta2 are placeholders; the real header fields are not shown here. */
struct Array {
   int64_t meta1, meta2;
   float content[];
};
/* Start of the result array object, header included:
 * corresponds to lea (%r12,%r8,8),%rdx. */
Array* result = heap_base + offset_of_result;
/* Low 32 bits of that address, converted to float units, modulo 8:
 * mov %edx,%r11d; shr $0x2,%r11d; and $0x7,%r11d. */
unsigned r11d = (unsigned) result / sizeof(float) % 8;


然后我们计算%r9d

/* Iteration count of the scalar alignment loop:
 * mov $0x3,%r9d; sub %r11d,%r9d; and $0x7,%r9d; inc %r9d.
 * For r11d > 3 the subtraction wraps around as unsigned; % 8 still yields the
 * intended residue, so the result is always in the range 1..8. */
unsigned r9d = (3 - r11d) % 8 + 1;


我们已经确定%r11d在0到7之间,所以我们可以只看数字。

| r11d |  r9d |
|   0  |   4  |
|   1  |   3  |
|   2  |   2  |
|   3  |   1  |
|   4  |   8  |
|   5  |   7  |
|   6  |   6  |
|   7  |   5  |


只有当我们记住所有的内存访问都被4个额外的浮点数偏移时,这才有意义。假设%r11d为1,表示数组头在最后32字节对齐之后开始1个浮点数= 4字节。然后,数组内容在对齐后从1 + 4 = 5浮点数= 20字节开始。然后,我们进行3次标量迭代,以获得下一个32字节对齐。
我在这段代码中遇到的一个问题是,对于r11d = 4,我们不应该进行任何标量迭代。我们可以直接进入矢量化代码。一个额外的and $0x7,r9d可以解决这个问题。但另一方面,由于循环总是至少执行一次,因此这节省了条件分支指令。

后续问题

因此,“循环对齐”不是用于循环控制的“i”的对齐,而是与高速缓存行大小的对齐?
不是缓存行大小,而是自然向量大小。这段代码使用AVX向量(YMM寄存器),它们的大小是32字节,因此我们尝试做到32字节对齐。缓存行是64字节。但请注意,对32字节向量做32字节对齐的访问永远不会跨越缓存行边界,这才是我们的主要目标。
如果是的话,你知道确定迭代次数的伪算法吗?我本以为只是(地址 % 8),但在L0000之前的代码中,似乎只有“lea”,而且它的结果没有进入用于循环控制的r9d或r11d。
我想我已经在上面说得很清楚了。不过,一般而言,如果用C语言编写,则可以编写:

/* Sketch of the three-stage loop structure: a scalar loop that runs until the
 * current element is 32-byte aligned, a loop over full vectors, and a scalar
 * loop for whatever is left. `...` marks values supplied by the caller. */
float* array = ...;
size_t array_size = ...;
const size_t vector_in_bytes = 32;
/* The pointer is reinterpreted as an integer so its alignment can be computed. */
size_t start_in_bytes = (size_t) array;
size_t offset_from_alignment = start_in_bytes % vector_in_bytes;
size_t bytes_to_alignment = vector_in_bytes - offset_from_alignment;
// if offset_from_alignment is 0, make bytes_to_alignment 0 instead of 32
bytes_to_alignment %= vector_in_bytes;
/* Clamp to array_size so a short array is handled entirely by the first loop. */
size_t elements_to_alignment = min(array_size, bytes_to_alignment / sizeof(float));
size_t i; // loop counter will be carried through all 3 loops
for(i = 0; i < elements_to_alignment; ++i) {
    // scalar loop for alignment
}
size_t elements_per_vector = vector_in_bytes / sizeof(float);
/* Runs only while a whole vector of elements still fits before array_size. */
for(; i + elements_per_vector <= array_size; i += elements_per_vector) {
    // vectorized loop
}
for(;  i < array_size; ++i) {
    // scalar loop for last few elements
}


这段代码还有几种更短的写法,但这个版本应该最容易理解。需要注意的关键一点是,我们必须把数组指针重新解释为整数,这也是我用C而不是Java来写的原因。
一个更短、更优雅的版本可以是:

/* Shorter variant of the alignment-count computation: round the start address
 * up to the next multiple of the vector size and take the distance to it. */
float* array = ...;
size_t array_size = ...;
const size_t vector_in_bytes = 32;
size_t start_in_bytes = (size_t) array;
/* vector_in_bytes is a power of two, so -vector_in_bytes (unsigned negation)
 * is a mask that clears the low bits; adding vector_in_bytes - 1 first makes
 * the masking round up instead of down. An already aligned address is unchanged. */
size_t next_aligned = (start_in_bytes + vector_in_bytes - 1) & -vector_in_bytes;
/* Byte distance converted to a float count, clamped to the array length. */
size_t elements_to_alignment = min(array_size,
      (next_aligned - start_in_bytes) / sizeof(float));
size_t i;
for(i = 0; i < elements_to_alignment; ++i) {
   ...
}

相关问题