当元素数量少于3-5百万时,numpy是如何获得速度的?

u7up0aaq  于 12个月前  发布在  其他
关注(0)|答案(1)|浏览(124)

一些numpy操作在元素数量下降到一定数量以下时会突然提高性能。
这是一个使用numpy来查找数组的最大值的函数。

def np_max(arr):
    """Return the largest element of ``arr`` using NumPy's ``np.max`` reduction."""
    largest = np.max(arr)
    return largest

字符串
为了比较,这里有一个函数,它也使用numba来查找最大值。

@numba.njit
def nb_max(arr):
    """Return the maximum of ``arr`` using an explicit element-by-element loop.

    The function is compiled with ``numba.njit``. ``arr[0]`` is read
    unconditionally, so the input is assumed to be non-empty.
    """
    # Seed the running maximum with the first element ...
    a_max = arr[0]
    # ... then fold every remaining element into it, one at a time.
    for a in arr[1:]:
        a_max = max(a_max, a)
    return a_max


这是用perfplot测量的运行时间。
代码:

def setup(n):
    rng = np.random.default_rng(0)
    return rng.random(n)

def benchmark_timeit():
    """Compare ``np_max`` and ``nb_max`` with ``timeit`` on 30 log-spaced sizes.

    Sizes run from 10**6 to 10**7. For each size one line is printed with the
    mean per-call runtime of each kernel (seconds * 1000, i.e. milliseconds)
    and the ratio numpy-runtime / numba-runtime.
    """
    # Call both kernels once on a small input before timing, so that the
    # first-call JIT compilation of the numba kernel is not measured.
    for f in [np_max, nb_max]:
        f(setup(100))

    n_run = 100
    for n in np.logspace(6, 7, num=30, dtype=int).tolist():
        arr = setup(n)
        np_max_time = timeit(lambda: np_max(arr), number=n_run) / n_run
        nb_max_time = timeit(lambda: nb_max(arr), number=n_run) / n_run
        # ``{n:,}`` prints n with thousands separators. The former ``{n,}``
        # formatted the one-element tuple ``(n,)`` and printed "n=(1000000,)".
        print(f"n={n:,}:"
              f" np_max={np_max_time * 1000:.2f}"
              f", nb_max={nb_max_time * 1000:.2f}"
              f", np/nb={np_max_time / nb_max_time:.2f}")

def benchmark_perfplot():
    """Sweep both kernels over log-spaced sizes with perfplot and save the plot.

    Sizes span 10**1 .. 10**8; the resulting figure is written to
    ``./temp2.png``.
    """
    # Run each kernel once on a small array before the measured sweep.
    for kernel in (np_max, nb_max):
        kernel(setup(100))

    sizes = np.logspace(1, 8, num=8 + 7 * 9, dtype=int).tolist()
    # Alternative, narrower sweep around the observed jump:
    # sizes = np.logspace(6, 7, num=30, dtype=int).tolist()

    results = perfplot.bench(
        n_range=sizes,
        setup=setup,
        kernels=[np_max, nb_max],
        equality_check=np.allclose,
        target_time_per_measurement=1.0,
    )
    results.save("./temp2.png")

# Script entry point: run the perfplot sweep first, then the timeit comparison.
if __name__ == "__main__":
    benchmark_perfplot()
    benchmark_timeit()


测试结果:


的数据
正如你所看到的,在10^6和10^7之间有一个很大的跳跃。用timeit测量的结果也证实了这一点。

n=(1000000,): np_max=0.11, nb_max=0.27, np/nb=0.42 <-- np/nb = numpy-runtime / numba-runtime
n=(1082636,): np_max=0.12, nb_max=0.30, np/nb=0.40
n=(1172102,): np_max=0.13, nb_max=0.32, np/nb=0.40
n=(1268961,): np_max=0.15, nb_max=0.46, np/nb=0.31
n=(1373823,): np_max=0.16, nb_max=0.38, np/nb=0.41
n=(1487352,): np_max=0.17, nb_max=0.41, np/nb=0.41
n=(1610262,): np_max=0.18, nb_max=0.45, np/nb=0.41
n=(1743328,): np_max=0.19, nb_max=0.48, np/nb=0.40
n=(1887391,): np_max=0.21, nb_max=0.52, np/nb=0.41
n=(2043359,): np_max=0.23, nb_max=0.56, np/nb=0.41
n=(2212216,): np_max=0.25, nb_max=0.61, np/nb=0.41
n=(2395026,): np_max=0.30, nb_max=0.69, np/nb=0.44
n=(2592943,): np_max=0.31, nb_max=0.72, np/nb=0.42
n=(2807216,): np_max=0.33, nb_max=0.77, np/nb=0.43
n=(3039195,): np_max=0.38, nb_max=0.88, np/nb=0.43 <-- 0.4
n=(3290344,): np_max=0.50, nb_max=0.97, np/nb=0.51      |
n=(3562247,): np_max=0.65, nb_max=1.07, np/nb=0.61      |
n=(3856620,): np_max=0.80, nb_max=1.19, np/nb=0.67      | 2x difference
n=(4175318,): np_max=1.00, nb_max=1.35, np/nb=0.74      |
n=(4520353,): np_max=1.11, nb_max=1.49, np/nb=0.75      |
n=(4893900,): np_max=1.25, nb_max=1.59, np/nb=0.78      |
n=(5298316,): np_max=1.44, nb_max=1.79, np/nb=0.81 <-- 0.8
n=(5736152,): np_max=1.57, nb_max=1.95, np/nb=0.80
n=(6210169,): np_max=1.71, nb_max=2.08, np/nb=0.82
n=(6723357,): np_max=1.85, nb_max=2.27, np/nb=0.81
n=(7278953,): np_max=2.02, nb_max=2.49, np/nb=0.81
n=(7880462,): np_max=2.17, nb_max=2.67, np/nb=0.81
n=(8531678,): np_max=2.44, nb_max=2.91, np/nb=0.84
n=(9236708,): np_max=2.61, nb_max=3.17, np/nb=0.82
n=(10000000,): np_max=2.81, nb_max=3.50, np/nb=0.80


我的问题是:
1.什么样的优化会带来如此大的差异?
2.有没有可能在numba中重现它?
请注意,我想要完成的是获得关于这种优化的知识,而不是用像并行化这样的开箱即用的方法来击败numpy。
规格:

  • AMD锐龙9 5900X
  • RAM 64 GB
  • Windows 10
  • Python 3.10.11
  • numpy 1.26.2
  • numba 0.58.1

这里是LLVM IR和汇编代码,n=10**6,其中与numpy的性能差异仍然很大。

def inspect():
    """Time ``nb_max`` on 10**6 elements and dump its generated asm and LLVM IR.

    Prints the mean per-call runtime in seconds, then writes the assembly to
    ``inspect_asm_10-6.txt`` and the LLVM IR to ``inspect_llvm_10-6.txt``.
    Exactly one compiled signature is expected for each dump.
    """
    repeats = 100

    sample = setup(10 ** 6)
    nb_max(sample)  # first call, made before timing
    total = timeit(lambda: nb_max(sample), number=repeats)
    print(total / repeats)

    # (output file, function returning a {signature: text} mapping)
    dumps = (
        ("inspect_asm_10-6.txt", nb_max.inspect_asm),
        ("inspect_llvm_10-6.txt", nb_max.inspect_llvm),
    )
    for filename, dump in dumps:
        per_signature = dump()
        assert len(per_signature) == 1
        only_signature = list(per_signature)[0]
        Path(filename).write_text(per_signature[only_signature])

# Script entry point: both benchmarks are commented out; only the asm/IR dump runs.
if __name__ == "__main__":
    # benchmark_perfplot()
    # benchmark_timeit()
    inspect()


inspect_asm_10-6.txt

.text
    .file   "<string>"
    # Native body of nb_max: a scalar running-maximum loop over doubles.
    # The main loop is unrolled 8x but uses only scalar vmaxsd instructions,
    # all chained through %xmm0 (one serial dependency chain).
    .globl  _ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
    .p2align    4, 0x90
    .type   _ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@function
_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE:
    pushq   %r14
    pushq   %rsi
    pushq   %rdi
    pushq   %rbx
    # %rax = element count, %r9 = pointer to the first element (stack arguments).
    movq    96(%rsp), %rax
    movq    88(%rsp), %r9
    # %r8 = start index of the arr[1:] slice.
    movl    $1, %r8d
    # Fewer than 2 elements: take the path that clamps the start index.
    cmpq    $2, %rax
    jl  .LBB0_10
    # %xmm0 = arr[0], the initial running maximum.
    vmovsd  (%r9), %xmm0
    # %rax = number of elements left after the start index.
    subq    %r8, %rax
    testq   %rax, %rax
    jle .LBB0_9
.LBB0_2:
    # %r10 = byte step added to the element pointer on every iteration
    # (a runtime value, not a constant 8).
    movq    104(%rsp), %r10
    # %r11 = max(%rax, 0): the iteration count clamped at zero.
    movq    %rax, %rdx
    sarq    $63, %rdx
    andnq   %rax, %rdx, %r11
    movl    %r11d, %edx
    leaq    -1(%r11), %rax
    # %rdx = count % 8: iterations left over for the remainder loop.
    andl    $7, %edx
    # Fewer than 8 iterations in total: skip the unrolled loop.
    cmpq    $7, %rax
    jae .LBB0_4
    xorl    %eax, %eax
    jmp .LBB0_6
.LBB0_4:
    # Mask that rounds the count down to a multiple of 8.
    movabsq $9223372036854775800, %rax
    # %rdi = address of the first element to process (base + start * 8).
    leaq    (%r9,%r8,8), %rdi
    # %r14 = 3 * step, used to address the 4th element of each group.
    leaq    (%r10,%r10,2), %r14
    andq    %rax, %r11
    # %rax = number of elements processed so far.
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB0_5:
    # Main loop: 8 scalar loads and 8 scalar vmaxsd per iteration.
    # Every vmaxsd reads and writes %xmm0, so the maxima are computed serially.
    vmovsd  (%rdi), %xmm1
    vmovsd  (%r10,%rdi), %xmm2
    vmovsd  (%rdi,%r10,2), %xmm3
    leaq    (%r14,%rdi), %rbx
    addq    $8, %rax
    vmaxsd  %xmm0, %xmm1, %xmm0
    vmaxsd  %xmm0, %xmm2, %xmm0
    vmovsd  (%r14,%rdi), %xmm2
    leaq    (%r10,%rbx), %rdi
    vmaxsd  %xmm0, %xmm3, %xmm0
    vmovsd  (%r10,%rbx), %xmm3
    leaq    (%r10,%rdi), %rbx
    leaq    (%r10,%rbx), %rsi
    vmaxsd  %xmm0, %xmm2, %xmm0
    vmovsd  (%r10,%rdi), %xmm2
    leaq    (%r10,%rsi), %rdi
    vmaxsd  %xmm0, %xmm3, %xmm0
    vmovsd  (%r10,%rbx), %xmm3
    addq    %r10, %rdi
    vmaxsd  %xmm0, %xmm2, %xmm0
    vmovsd  (%r10,%rsi), %xmm2
    vmaxsd  %xmm0, %xmm3, %xmm0
    vmaxsd  %xmm0, %xmm2, %xmm0
    cmpq    %rax, %r11
    jne .LBB0_5
.LBB0_6:
    # Remainder: nothing to do when count % 8 == 0.
    testq   %rdx, %rdx
    je  .LBB0_9
    # %rax = base + processed * step + start * 8: first leftover element.
    imulq   %r10, %rax
    addq    %rax, %r9
    leaq    (%r9,%r8,8), %rax
    .p2align    4, 0x90
.LBB0_8:
    # Remainder loop: one element per iteration, %rdx times.
    vmovsd  (%rax), %xmm1
    addq    %r10, %rax
    decq    %rdx
    vmaxsd  %xmm0, %xmm1, %xmm0
    jne .LBB0_8
.LBB0_9:
    # Store the result through the pointer in %rcx and return 0 in %eax.
    vmovsd  %xmm0, (%rcx)
    xorl    %eax, %eax
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    retq
.LBB0_10:
    # Count < 2: the start index becomes the count itself, so the
    # remaining length computed below is zero.
    movq    %rax, %r8
    vmovsd  (%r9), %xmm0
    subq    %r8, %rax
    testq   %rax, %rax
    jg  .LBB0_2
    jmp .LBB0_9
.Lfunc_end0:
    .size   _ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, .Lfunc_end0-_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE

    .globl  _ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
    .p2align    4, 0x90
    .type   _ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@function
_ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register %rbp
    pushq   %rsi
    andq    $-32, %rsp
    subq    $224, %rsp
    vmovaps %xmm6, -32(%rbp)
    .cfi_offset %rsi, -24
    .cfi_offset %xmm6, -48
    leaq    88(%rsp), %rax
    movq    %rdx, %rcx
    movabsq $.const.nb_max, %rdx
    movl    $1, %r8d
    movl    $1, %r9d
    movq    %rax, 32(%rsp)
    movabsq $PyArg_UnpackTuple, %rax
    callq   *%rax
    testl   %eax, %eax
    je  .LBB1_1
    movabsq $_ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, %rax
    cmpq    $0, (%rax)
    je  .LBB1_4
    movq    88(%rsp), %rcx
    movabsq $NRT_adapt_ndarray_from_python, %rax
    leaq    96(%rsp), %rdx
    vxorps  %xmm0, %xmm0, %xmm0
    vmovaps %ymm0, 96(%rsp)
    vmovups %ymm0, 120(%rsp)
    vzeroupper
    callq   *%rax
    testl   %eax, %eax
    jne .LBB1_8
    cmpq    $8, 120(%rsp)
    jne .LBB1_8
    vmovaps 128(%rsp), %xmm0
    movq    144(%rsp), %rax
    movq    96(%rsp), %rsi
    leaq    80(%rsp), %rcx
    movq    $0, 80(%rsp)
    movq    %rax, 64(%rsp)
    movabsq $_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, %rax
    vmovups %xmm0, 48(%rsp)
    callq   *%rax
    vmovsd  80(%rsp), %xmm6
    movabsq $NRT_decref, %rax
    movq    %rsi, %rcx
    callq   *%rax
    movabsq $PyFloat_FromDouble, %rax
    vmovaps %xmm6, %xmm0
    callq   *%rax
.LBB1_2:
    vmovaps -32(%rbp), %xmm6
    leaq    -8(%rbp), %rsp
    popq    %rsi
    popq    %rbp
    retq
.LBB1_4:
    movabsq $PyExc_RuntimeError, %rcx
    movabsq $".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE", %rdx
    jmp .LBB1_5
.LBB1_8:
    movabsq $PyExc_TypeError, %rcx
    movabsq $".const.can't unbox array from PyObject into native value.  The object maybe of a different type", %rdx
.LBB1_5:
    movabsq $PyErr_SetString, %rax
    callq   *%rax
.LBB1_1:
    xorl    %eax, %eax
    jmp .LBB1_2
.Lfunc_end1:
    .size   _ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, .Lfunc_end1-_ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
    .cfi_endproc

    .globl  cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
    .p2align    4, 0x90
    .type   cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@function
cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE:
    subq    $88, %rsp
    vmovaps 128(%rsp), %xmm0
    movq    144(%rsp), %rax
    leaq    80(%rsp), %rcx
    movq    $0, 80(%rsp)
    movq    %rax, 64(%rsp)
    movabsq $_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, %rax
    vmovups %xmm0, 48(%rsp)
    callq   *%rax
    vmovsd  80(%rsp), %xmm0
    addq    $88, %rsp
    retq
.Lfunc_end2:
    .size   cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, .Lfunc_end2-cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE

    .weak   NRT_decref
    .p2align    4, 0x90
    .type   NRT_decref,@function
NRT_decref:
    .cfi_startproc
    testq   %rcx, %rcx
    je  .LBB3_2
    #MEMBARRIER
    lock        decq    (%rcx)
    je  .LBB3_3
.LBB3_2:
    retq
.LBB3_3:
    movabsq $NRT_MemInfo_call_dtor, %rax
    #MEMBARRIER
    rex64 jmpq  *%rax
.Lfunc_end3:
    .size   NRT_decref, .Lfunc_end3-NRT_decref
    .cfi_endproc

    .type   .const.nb_max,@object
    .section    .rodata,"a",@progbits
.const.nb_max:
    .asciz  "nb_max"
    .size   .const.nb_max, 7

    .type   _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@object
    .comm   _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,8,8
    .type   ".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE",@object
    .p2align    4
".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE":
    .asciz  "missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE"
    .size   ".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE", 128

    .type   ".const.can't unbox array from PyObject into native value.  The object maybe of a different type",@object
    .p2align    4
".const.can't unbox array from PyObject into native value.  The object maybe of a different type":
    .asciz  "can't unbox array from PyObject into native value.  The object maybe of a different type"
    .size   ".const.can't unbox array from PyObject into native value.  The object maybe of a different type", 89

    .section    ".note.GNU-stack","",@progbits


inspect_llvm_10-6.txt

; ModuleID = 'nb_max'
source_filename = "<string>"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"

@.const.nb_max = internal constant [7 x i8] c"nb_max\00"
@_ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE = common local_unnamed_addr global i8* null
@".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE" = internal constant [128 x i8] c"missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE\00"
@PyExc_TypeError = external global i8
@".const.can't unbox array from PyObject into native value.  The object maybe of a different type" = internal constant [89 x i8] c"can't unbox array from PyObject into native value.  The object maybe of a different type\00"
@PyExc_RuntimeError = external global i8

; Native body of nb_max. Relevant parameters:
;   %retptr       - where the result is stored
;   %arg.arr.4    - pointer to the first element
;   %arg.arr.5.0  - element count
;   %arg.arr.6.0  - byte step added to the element address on every iteration
; The loop is a scalar fcmp/select reduction, unrolled 8x, with no vector types.
; Function Attrs: nofree norecurse nosync nounwind
define i32 @_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(double* noalias nocapture writeonly %retptr, { i8*, i32, i8*, i8*, i32 }** noalias nocapture readnone %excinfo, i8* nocapture readnone %arg.arr.0, i8* nocapture readnone %arg.arr.1, i64 %arg.arr.2, i64 %arg.arr.3, double* %arg.arr.4, i64 %arg.arr.5.0, i64 %arg.arr.6.0) local_unnamed_addr #0 {
; Entry: load arr[0] as the initial maximum and test whether count < 2.
B0.else.endif:
  %.44 = load double, double* %arg.arr.4, align 8
  %.93 = icmp slt i64 %arg.arr.5.0, 2
  br i1 %.93, label %B0.else.endif.if, label %B0.endif, !prof !0

; Main loop, unrolled 8x. Each step loads one double and keeps the larger of
; it and the running maximum (fcmp ogt + select). Every step depends on the
; result of the previous one (%.321 -> %.321.1 -> ... -> %.321.7).
B24:                                              ; preds = %B24, %B24.lr.ph.new
  %lsr.iv9 = phi i64 [ %8, %B24 ], [ %16, %B24.lr.ph.new ]
  %a_max.2.04 = phi double [ %.44, %B24.lr.ph.new ], [ %.321.7, %B24 ]
  %.224.03 = phi i64 [ 0, %B24.lr.ph.new ], [ %.294.7, %B24 ]
  %.290 = inttoptr i64 %lsr.iv9 to double*
  %.291 = load double, double* %.290, align 8
  %.320 = fcmp ogt double %.291, %a_max.2.04
  %.321 = select i1 %.320, double %.291, double %a_max.2.04
  %0 = add i64 %arg.arr.6.0, %lsr.iv9
  %.290.1 = inttoptr i64 %0 to double*
  %.291.1 = load double, double* %.290.1, align 8
  %.320.1 = fcmp ogt double %.291.1, %.321
  %.321.1 = select i1 %.320.1, double %.291.1, double %.321
  %sunkaddr = inttoptr i64 %lsr.iv9 to double*
  %sunkaddr11 = mul i64 %arg.arr.6.0, 2
  %1 = bitcast double* %sunkaddr to i8*
  %sunkaddr12 = getelementptr i8, i8* %1, i64 %sunkaddr11
  %2 = bitcast i8* %sunkaddr12 to double*
  %.291.2 = load double, double* %2, align 8
  %.320.2 = fcmp ogt double %.291.2, %.321.1
  %.321.2 = select i1 %.320.2, double %.291.2, double %.321.1
  %3 = add i64 %17, %lsr.iv9
  %.290.3 = inttoptr i64 %3 to double*
  %.291.3 = load double, double* %.290.3, align 8
  %.320.3 = fcmp ogt double %.291.3, %.321.2
  %.321.3 = select i1 %.320.3, double %.291.3, double %.321.2
  %4 = add i64 %arg.arr.6.0, %3
  %.290.4 = inttoptr i64 %4 to double*
  %.291.4 = load double, double* %.290.4, align 8
  %.320.4 = fcmp ogt double %.291.4, %.321.3
  %.321.4 = select i1 %.320.4, double %.291.4, double %.321.3
  %5 = add i64 %arg.arr.6.0, %4
  %.290.5 = inttoptr i64 %5 to double*
  %.291.5 = load double, double* %.290.5, align 8
  %.320.5 = fcmp ogt double %.291.5, %.321.4
  %.321.5 = select i1 %.320.5, double %.291.5, double %.321.4
  %6 = add i64 %arg.arr.6.0, %5
  %.290.6 = inttoptr i64 %6 to double*
  %.291.6 = load double, double* %.290.6, align 8
  %.320.6 = fcmp ogt double %.291.6, %.321.5
  %.321.6 = select i1 %.320.6, double %.291.6, double %.321.5
  %7 = add i64 %arg.arr.6.0, %6
  %.290.7 = inttoptr i64 %7 to double*
  %.291.7 = load double, double* %.290.7, align 8
  %.294.7 = add nuw i64 %.224.03, 8
  %.320.7 = fcmp ogt double %.291.7, %.321.6
  %.321.7 = select i1 %.320.7, double %.291.7, double %.321.6
  %niter.ncmp.7 = icmp eq i64 %unroll_iter, %.294.7
  %8 = add i64 %arg.arr.6.0, %7
  br i1 %niter.ncmp.7, label %B38.loopexit.unr-lcssa, label %B24

; After (or instead of) the unrolled loop: run the remainder loop only if
; count % 8 (%xtraiter) is non-zero.
B38.loopexit.unr-lcssa:                           ; preds = %B24, %B24.lr.ph
  %.321.lcssa.ph = phi double [ undef, %B24.lr.ph ], [ %.321.7, %B24 ]
  %a_max.2.04.unr = phi double [ %.44, %B24.lr.ph ], [ %.321.7, %B24 ]
  %.224.03.unr = phi i64 [ 0, %B24.lr.ph ], [ %.294.7, %B24 ]
  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
  br i1 %lcmp.mod.not, label %B38, label %B24.epil.preheader

; Address of the first leftover element: base + processed * step + start * 8.
B24.epil.preheader:                               ; preds = %B38.loopexit.unr-lcssa
  %9 = ptrtoint double* %arg.arr.4 to i64
  %10 = mul i64 %.224.03.unr, %arg.arr.6.0
  %11 = add i64 %9, %10
  %12 = shl i64 %.73.sroa.0.0, 3
  %13 = add i64 %11, %12
  br label %B24.epil

; Remainder loop: one element per iteration, counting %xtraiter down to zero.
B24.epil:                                         ; preds = %B24.epil.preheader, %B24.epil
  %lsr.iv7 = phi i64 [ %xtraiter, %B24.epil.preheader ], [ %lsr.iv.next8, %B24.epil ]
  %lsr.iv = phi i64 [ %13, %B24.epil.preheader ], [ %lsr.iv.next, %B24.epil ]
  %a_max.2.04.epil = phi double [ %.321.epil, %B24.epil ], [ %a_max.2.04.unr, %B24.epil.preheader ]
  %.290.epil = inttoptr i64 %lsr.iv to double*
  %.291.epil = load double, double* %.290.epil, align 8
  %.320.epil = fcmp ogt double %.291.epil, %a_max.2.04.epil
  %.321.epil = select i1 %.320.epil, double %.291.epil, double %a_max.2.04.epil
  %lsr.iv.next = add i64 %lsr.iv, %arg.arr.6.0
  %lsr.iv.next8 = add nsw i64 %lsr.iv7, -1
  %epil.iter.cmp.not = icmp eq i64 %lsr.iv.next8, 0
  br i1 %epil.iter.cmp.not, label %B38, label %B24.epil, !llvm.loop !1

; Exit: store the final maximum through %retptr and return status 0.
B38:                                              ; preds = %B24.epil, %B38.loopexit.unr-lcssa, %B0.endif
  %a_max.2.0.lcssa = phi double [ %.44, %B0.endif ], [ %.321.lcssa.ph, %B38.loopexit.unr-lcssa ], [ %.321.epil, %B24.epil ]
  store double %a_max.2.0.lcssa, double* %retptr, align 8
  ret i32 0

; %.73.sroa.0.0 = start index (1, or the count itself when count < 2);
; %.170 = max(count - start, 0) = number of loop iterations.
B0.endif:                                         ; preds = %B0.else.endif, %B0.else.endif.if
  %.73.sroa.0.0 = phi i64 [ %arg.arr.5.0, %B0.else.endif.if ], [ 1, %B0.else.endif ]
  %.161 = sub i64 %arg.arr.5.0, %.73.sroa.0.0
  %.168.inv = icmp sgt i64 %.161, 0
  %.170 = select i1 %.168.inv, i64 %.161, i64 0
  br i1 %.168.inv, label %B24.lr.ph, label %B38

; Loop setup: %xtraiter = iterations % 8; skip the unrolled loop when there
; are fewer than 8 iterations.
B24.lr.ph:                                        ; preds = %B0.endif
  %.186 = getelementptr double, double* %arg.arr.4, i64 %.73.sroa.0.0
  %14 = add nsw i64 %.170, -1
  %xtraiter = and i64 %.170, 7
  %15 = icmp ult i64 %14, 7
  br i1 %15, label %B38.loopexit.unr-lcssa, label %B24.lr.ph.new

; %unroll_iter = iterations rounded down to a multiple of 8; %17 = 3 * step.
B24.lr.ph.new:                                    ; preds = %B24.lr.ph
  %16 = ptrtoint double* %.186 to i64
  %unroll_iter = and i64 %.170, 9223372036854775800
  %17 = mul i64 %arg.arr.6.0, 3
  br label %B24

; Count < 2 path: an empty block that just feeds the phi in B0.endif.
B0.else.endif.if:                                 ; preds = %B0.else.endif
  br label %B0.endif
}

define i8* @_ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(i8* nocapture readnone %py_closure, i8* %py_args, i8* nocapture readnone %py_kws) local_unnamed_addr {
entry:
  %.5 = alloca i8*, align 8
  %.6 = call i32 (i8*, i8*, i64, i64, ...) @PyArg_UnpackTuple(i8* %py_args, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.const.nb_max, i64 0, i64 0), i64 1, i64 1, i8** nonnull %.5)
  %.7 = icmp eq i32 %.6, 0
  %.21 = alloca { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, align 8
  %.43 = alloca double, align 8
  br i1 %.7, label %common.ret, label %entry.endif, !prof !0

common.ret:                                       ; preds = %entry.endif.endif.endif.thread, %entry, %entry.endif.endif.endif.endif, %entry.endif.if
  %common.ret.op = phi i8* [ null, %entry.endif.if ], [ %.67, %entry.endif.endif.endif.endif ], [ null, %entry ], [ null, %entry.endif.endif.endif.thread ]
  ret i8* %common.ret.op

entry.endif:                                      ; preds = %entry
  %.11 = load i8*, i8** @_ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, align 8
  %.16 = icmp eq i8* %.11, null
  br i1 %.16, label %entry.endif.if, label %entry.endif.endif, !prof !0

entry.endif.if:                                   ; preds = %entry.endif
  call void @PyErr_SetString(i8* nonnull @PyExc_RuntimeError, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE", i64 0, i64 0))
  br label %common.ret

entry.endif.endif:                                ; preds = %entry.endif
  %.20 = load i8*, i8** %.5, align 8
  %.24 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
  %0 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
  call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(56) %0, i8 0, i64 56, i1 false)
  %.25 = call i32 @NRT_adapt_ndarray_from_python(i8* %.20, i8* nonnull %.24)
  %1 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
  %sunkaddr = getelementptr inbounds i8, i8* %1, i64 24
  %2 = bitcast i8* %sunkaddr to i64*
  %.29 = load i64, i64* %2, align 8
  %.30 = icmp ne i64 %.29, 8
  %.31 = icmp ne i32 %.25, 0
  %.32 = or i1 %.31, %.30
  br i1 %.32, label %entry.endif.endif.endif.thread, label %entry.endif.endif.endif.endif, !prof !0

entry.endif.endif.endif.thread:                   ; preds = %entry.endif.endif
  call void @PyErr_SetString(i8* nonnull @PyExc_TypeError, i8* getelementptr inbounds ([89 x i8], [89 x i8]* @".const.can't unbox array from PyObject into native value.  The object maybe of a different type", i64 0, i64 0))
  br label %common.ret

entry.endif.endif.endif.endif:                    ; preds = %entry.endif.endif
  %3 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8**
  %.36.fca.0.load = load i8*, i8** %3, align 8
  %4 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
  %sunkaddr3 = getelementptr inbounds i8, i8* %4, i64 32
  %5 = bitcast i8* %sunkaddr3 to double**
  %.36.fca.4.load = load double*, double** %5, align 8
  %6 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
  %sunkaddr4 = getelementptr inbounds i8, i8* %6, i64 40
  %7 = bitcast i8* %sunkaddr4 to i64*
  %.36.fca.5.0.load = load i64, i64* %7, align 8
  %8 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
  %sunkaddr5 = getelementptr inbounds i8, i8* %8, i64 48
  %9 = bitcast i8* %sunkaddr5 to i64*
  %.36.fca.6.0.load = load i64, i64* %9, align 8
  store double 0.000000e+00, double* %.43, align 8
  %.49 = call i32 @_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(double* nonnull %.43, { i8*, i32, i8*, i8*, i32 }** nonnull undef, i8* undef, i8* undef, i64 undef, i64 undef, double* %.36.fca.4.load, i64 %.36.fca.5.0.load, i64 %.36.fca.6.0.load) #1
  %.59 = load double, double* %.43, align 8
  call void @NRT_decref(i8* %.36.fca.0.load)
  %.67 = call i8* @PyFloat_FromDouble(double %.59)
  br label %common.ret
}

declare i32 @PyArg_UnpackTuple(i8*, i8*, i64, i64, ...) local_unnamed_addr

declare void @PyErr_SetString(i8*, i8*) local_unnamed_addr

declare i32 @NRT_adapt_ndarray_from_python(i8* nocapture, i8* nocapture) local_unnamed_addr

declare i8* @PyFloat_FromDouble(double) local_unnamed_addr

; Function Attrs: nofree norecurse nosync nounwind
define double @cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE({ i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1) local_unnamed_addr #0 {
entry:
  %.3 = alloca double, align 8
  store double 0.000000e+00, double* %.3, align 8
  %extracted.data = extractvalue { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1, 4
  %extracted.shape = extractvalue { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1, 5
  %.7 = extractvalue [1 x i64] %extracted.shape, 0
  %extracted.strides = extractvalue { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1, 6
  %.8 = extractvalue [1 x i64] %extracted.strides, 0
  %.9 = call i32 @_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(double* nonnull %.3, { i8*, i32, i8*, i8*, i32 }** nonnull undef, i8* undef, i8* undef, i64 undef, i64 undef, double* %extracted.data, i64 %.7, i64 %.8) #1
  %.19 = load double, double* %.3, align 8
  ret double %.19
}

; Function Attrs: noinline
define linkonce_odr void @NRT_decref(i8* %.1) local_unnamed_addr #1 {
.3:
  %.4 = icmp eq i8* %.1, null
  br i1 %.4, label %common.ret1, label %.3.endif, !prof !0

common.ret1:                                      ; preds = %.3, %.3.endif
  ret void

.3.endif:                                         ; preds = %.3
  fence release
  %.8 = bitcast i8* %.1 to i64*
  %.4.i = atomicrmw sub i64* %.8, i64 1 monotonic, align 8
  %.10 = icmp eq i64 %.4.i, 1
  br i1 %.10, label %.3.endif.if, label %common.ret1, !prof !0

.3.endif.if:                                      ; preds = %.3.endif
  fence acquire
  tail call void @NRT_MemInfo_call_dtor(i8* nonnull %.1)
  ret void
}

declare void @NRT_MemInfo_call_dtor(i8*) local_unnamed_addr

; Function Attrs: argmemonly nofree nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2

attributes #0 = { nofree norecurse nosync nounwind }
attributes #1 = { noinline }
attributes #2 = { argmemonly nofree nounwind willreturn writeonly }

!0 = !{!"branch_weights", i32 1, i32 99}
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.unroll.disable"}

3ks5zfa0

3ks5zfa01#

我可以通过手动展开循环来榨取一些额外的性能(但np.max仍然更快)。这完全取决于处理器以及它如何处理缓存和从内存中预取值:

@numba.njit
def nb_max(arr):
    """Return the maximum of ``arr`` using a manual 4-way unrolled loop.

    Four independent running maxima are kept, one per interleaved slice
    (``arr[0::4]`` .. ``arr[3::4]``), and combined at the end. ``zip`` stops at
    the shortest slice, so the last ``len(arr) % 4`` elements are folded in by
    a separate tail loop.

    The accumulators start at ``-np.inf``, so the result is a float and an
    empty input yields ``-inf``.
    """
    ma = mb = mc = md = -np.inf
    for a, b, c, d in zip(
        arr[::4],
        arr[1::4],
        arr[2::4],
        arr[3::4],
    ):
        ma = max(a, ma)
        mb = max(b, mb)
        mc = max(c, mc)
        md = max(d, md)

    m = max(ma, mb, mc, md)

    # Fold in the elements not covered by a complete group of four. Slicing
    # from the last multiple of 4 touches exactly those elements; the earlier
    # ``arr[len(arr) - 3]`` read was out of bounds for a one-element array.
    n = len(arr)
    for a in arr[n - n % 4:]:
        m = max(m, a)

    return m

字符串
在我的机器(AMD 5700x)上,对于float64数组得到了下面这张图:


的数据

相关问题