一些numpy操作在元素数量下降到一定数量以下时会突然提高性能。
这是一个使用numpy来查找数组的最大值的函数。
def np_max(arr):
    """Return the maximum element of ``arr`` using ``np.max``."""
    return np.max(arr)
字符串
为了比较,这里还有一个使用numba来查找最大值的函数。
@numba.njit
def nb_max(arr):
    """Return the maximum element of ``arr`` with an explicit scan loop.

    The first element seeds the running maximum, so ``arr`` must contain at
    least one element. The function is compiled by ``numba.njit``.
    """
    a_max = arr[0]
    # Compare the running maximum against every remaining element.
    for a in arr[1:]:
        a_max = max(a_max, a)
    return a_max
型
这是用perfplot测量的运行时间。
代码:
def setup(n):
rng = np.random.default_rng(0)
return rng.random(n)
def benchmark_timeit():
    """Time ``np_max`` and ``nb_max`` with ``timeit`` and print the results.

    Thirty array sizes, log-spaced between 10**6 and 10**7, are measured.
    For each size the printed line holds the mean time per call of both
    functions (scaled by 1000) and the ratio numpy-time / numba-time.
    """
    # Call each function once on a small array before timing; presumably
    # this keeps numba's first-call compilation out of the measurements.
    for f in [np_max, nb_max]:
        f(setup(100))

    n_run = 100
    for n in np.logspace(6, 7, num=30, dtype=int).tolist():
        arr = setup(n)
        np_max_time = timeit(lambda: np_max(arr), number=n_run) / n_run
        nb_max_time = timeit(lambda: nb_max(arr), number=n_run) / n_run
        # ``{n:,}`` prints n with thousands separators; ``{n,}`` would
        # format the one-element tuple ``(n,)`` instead.
        print(f"n={n:,}:"
              f" np_max={np_max_time * 1000:.2f}"
              f", nb_max={nb_max_time * 1000:.2f}"
              f", np/nb={np_max_time / nb_max_time:.2f}")
def benchmark_perfplot():
    """Benchmark ``np_max`` and ``nb_max`` with perfplot and save the plot.

    Array sizes are log-spaced from 10**1 to 10**8 and produced by
    ``setup``. The results of both kernels are compared with
    ``np.allclose``, and the plot is written to ``./temp2.png``.
    """
    # Call each function once on a small array before benchmarking;
    # presumably this keeps numba's first-call compilation out of the data.
    for f in [np_max, nb_max]:
        f(setup(100))

    data = perfplot.bench(
        n_range=np.logspace(1, 8, num=8 + 7 * 9, dtype=int).tolist(),
        # Alternative range zooming in on 10**6..10**7:
        # n_range=np.logspace(6, 7, num=30, dtype=int).tolist(),
        setup=setup,
        kernels=[np_max, nb_max],
        equality_check=np.allclose,
        target_time_per_measurement=1.0,
    )
    data.save("./temp2.png")
# Script entry point: run the perfplot benchmark, then the timeit benchmark.
if __name__ == "__main__":
    benchmark_perfplot()
    benchmark_timeit()
型
测试结果:
的数据
正如你所看到的,在10^6和10^7之间有一个很大的跳跃。用timeit测量的结果也证实了这一点。
n=(1000000,): np_max=0.11, nb_max=0.27, np/nb=0.42 <-- np/nb = numpy-runtime / numba-runtime
n=(1082636,): np_max=0.12, nb_max=0.30, np/nb=0.40
n=(1172102,): np_max=0.13, nb_max=0.32, np/nb=0.40
n=(1268961,): np_max=0.15, nb_max=0.46, np/nb=0.31
n=(1373823,): np_max=0.16, nb_max=0.38, np/nb=0.41
n=(1487352,): np_max=0.17, nb_max=0.41, np/nb=0.41
n=(1610262,): np_max=0.18, nb_max=0.45, np/nb=0.41
n=(1743328,): np_max=0.19, nb_max=0.48, np/nb=0.40
n=(1887391,): np_max=0.21, nb_max=0.52, np/nb=0.41
n=(2043359,): np_max=0.23, nb_max=0.56, np/nb=0.41
n=(2212216,): np_max=0.25, nb_max=0.61, np/nb=0.41
n=(2395026,): np_max=0.30, nb_max=0.69, np/nb=0.44
n=(2592943,): np_max=0.31, nb_max=0.72, np/nb=0.42
n=(2807216,): np_max=0.33, nb_max=0.77, np/nb=0.43
n=(3039195,): np_max=0.38, nb_max=0.88, np/nb=0.43 <-- 0.4
n=(3290344,): np_max=0.50, nb_max=0.97, np/nb=0.51 |
n=(3562247,): np_max=0.65, nb_max=1.07, np/nb=0.61 |
n=(3856620,): np_max=0.80, nb_max=1.19, np/nb=0.67 | 2x difference
n=(4175318,): np_max=1.00, nb_max=1.35, np/nb=0.74 |
n=(4520353,): np_max=1.11, nb_max=1.49, np/nb=0.75 |
n=(4893900,): np_max=1.25, nb_max=1.59, np/nb=0.78 |
n=(5298316,): np_max=1.44, nb_max=1.79, np/nb=0.81 <-- 0.8
n=(5736152,): np_max=1.57, nb_max=1.95, np/nb=0.80
n=(6210169,): np_max=1.71, nb_max=2.08, np/nb=0.82
n=(6723357,): np_max=1.85, nb_max=2.27, np/nb=0.81
n=(7278953,): np_max=2.02, nb_max=2.49, np/nb=0.81
n=(7880462,): np_max=2.17, nb_max=2.67, np/nb=0.81
n=(8531678,): np_max=2.44, nb_max=2.91, np/nb=0.84
n=(9236708,): np_max=2.61, nb_max=3.17, np/nb=0.82
n=(10000000,): np_max=2.81, nb_max=3.50, np/nb=0.80
型
我的问题是:
1.什么样的优化会带来如此大的差异?
2.有没有可能在numba中重现它?
请注意,我想要完成的是获得关于这种优化的知识,而不是用像并行化这样的开箱即用的方法来击败numpy。
规格:
- AMD锐龙9 5900X
- RAM 64 GB
- Windows 10
- Python 3.10.11
- numpy 1.26.2
- numba 0.58.1
这里是LLVM IR和汇编代码,n=10**6,其中与numpy的性能差异仍然很大。
def inspect():
    """Time ``nb_max`` on 10**6 elements and dump its assembly and LLVM IR.

    Prints the mean time per call over 100 runs, then writes the output of
    ``nb_max.inspect_asm()`` to ``inspect_asm_10-6.txt`` and the output of
    ``nb_max.inspect_llvm()`` to ``inspect_llvm_10-6.txt``.
    """
    n_run = 100
    arr = setup(10 ** 6)
    # Call once before timing and inspecting; presumably this triggers the
    # compilation whose output is dumped below.
    nb_max(arr)
    print(timeit(lambda: nb_max(arr), number=n_run) / n_run)

    dumps = (
        ("inspect_asm_10-6.txt", nb_max.inspect_asm),
        ("inspect_llvm_10-6.txt", nb_max.inspect_llvm),
    )
    for filename, dump in dumps:
        listings = dump()
        # Exactly one entry is expected, since nb_max was only called with
        # one kind of argument.
        assert len(listings) == 1
        Path(filename).write_text(next(iter(listings.values())))
# Script entry point: the benchmarks are disabled here; only the
# assembly/LLVM dump is produced.
if __name__ == "__main__":
    # benchmark_perfplot()
    # benchmark_timeit()
    inspect()
inspect_asm_10-6.txt:
.text
.file "<string>"
.globl _ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
.p2align 4, 0x90
.type _ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@function
_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE:
pushq %r14
pushq %rsi
pushq %rdi
pushq %rbx
movq 96(%rsp), %rax
movq 88(%rsp), %r9
movl $1, %r8d
cmpq $2, %rax
jl .LBB0_10
vmovsd (%r9), %xmm0
subq %r8, %rax
testq %rax, %rax
jle .LBB0_9
.LBB0_2:
movq 104(%rsp), %r10
movq %rax, %rdx
sarq $63, %rdx
andnq %rax, %rdx, %r11
movl %r11d, %edx
leaq -1(%r11), %rax
andl $7, %edx
cmpq $7, %rax
jae .LBB0_4
xorl %eax, %eax
jmp .LBB0_6
.LBB0_4:
movabsq $9223372036854775800, %rax
leaq (%r9,%r8,8), %rdi
leaq (%r10,%r10,2), %r14
andq %rax, %r11
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_5:
vmovsd (%rdi), %xmm1
vmovsd (%r10,%rdi), %xmm2
vmovsd (%rdi,%r10,2), %xmm3
leaq (%r14,%rdi), %rbx
addq $8, %rax
vmaxsd %xmm0, %xmm1, %xmm0
vmaxsd %xmm0, %xmm2, %xmm0
vmovsd (%r14,%rdi), %xmm2
leaq (%r10,%rbx), %rdi
vmaxsd %xmm0, %xmm3, %xmm0
vmovsd (%r10,%rbx), %xmm3
leaq (%r10,%rdi), %rbx
leaq (%r10,%rbx), %rsi
vmaxsd %xmm0, %xmm2, %xmm0
vmovsd (%r10,%rdi), %xmm2
leaq (%r10,%rsi), %rdi
vmaxsd %xmm0, %xmm3, %xmm0
vmovsd (%r10,%rbx), %xmm3
addq %r10, %rdi
vmaxsd %xmm0, %xmm2, %xmm0
vmovsd (%r10,%rsi), %xmm2
vmaxsd %xmm0, %xmm3, %xmm0
vmaxsd %xmm0, %xmm2, %xmm0
cmpq %rax, %r11
jne .LBB0_5
.LBB0_6:
testq %rdx, %rdx
je .LBB0_9
imulq %r10, %rax
addq %rax, %r9
leaq (%r9,%r8,8), %rax
.p2align 4, 0x90
.LBB0_8:
vmovsd (%rax), %xmm1
addq %r10, %rax
decq %rdx
vmaxsd %xmm0, %xmm1, %xmm0
jne .LBB0_8
.LBB0_9:
vmovsd %xmm0, (%rcx)
xorl %eax, %eax
popq %rbx
popq %rdi
popq %rsi
popq %r14
retq
.LBB0_10:
movq %rax, %r8
vmovsd (%r9), %xmm0
subq %r8, %rax
testq %rax, %rax
jg .LBB0_2
jmp .LBB0_9
.Lfunc_end0:
.size _ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, .Lfunc_end0-_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
.globl _ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
.p2align 4, 0x90
.type _ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@function
_ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %rsi
andq $-32, %rsp
subq $224, %rsp
vmovaps %xmm6, -32(%rbp)
.cfi_offset %rsi, -24
.cfi_offset %xmm6, -48
leaq 88(%rsp), %rax
movq %rdx, %rcx
movabsq $.const.nb_max, %rdx
movl $1, %r8d
movl $1, %r9d
movq %rax, 32(%rsp)
movabsq $PyArg_UnpackTuple, %rax
callq *%rax
testl %eax, %eax
je .LBB1_1
movabsq $_ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, %rax
cmpq $0, (%rax)
je .LBB1_4
movq 88(%rsp), %rcx
movabsq $NRT_adapt_ndarray_from_python, %rax
leaq 96(%rsp), %rdx
vxorps %xmm0, %xmm0, %xmm0
vmovaps %ymm0, 96(%rsp)
vmovups %ymm0, 120(%rsp)
vzeroupper
callq *%rax
testl %eax, %eax
jne .LBB1_8
cmpq $8, 120(%rsp)
jne .LBB1_8
vmovaps 128(%rsp), %xmm0
movq 144(%rsp), %rax
movq 96(%rsp), %rsi
leaq 80(%rsp), %rcx
movq $0, 80(%rsp)
movq %rax, 64(%rsp)
movabsq $_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, %rax
vmovups %xmm0, 48(%rsp)
callq *%rax
vmovsd 80(%rsp), %xmm6
movabsq $NRT_decref, %rax
movq %rsi, %rcx
callq *%rax
movabsq $PyFloat_FromDouble, %rax
vmovaps %xmm6, %xmm0
callq *%rax
.LBB1_2:
vmovaps -32(%rbp), %xmm6
leaq -8(%rbp), %rsp
popq %rsi
popq %rbp
retq
.LBB1_4:
movabsq $PyExc_RuntimeError, %rcx
movabsq $".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE", %rdx
jmp .LBB1_5
.LBB1_8:
movabsq $PyExc_TypeError, %rcx
movabsq $".const.can't unbox array from PyObject into native value. The object maybe of a different type", %rdx
.LBB1_5:
movabsq $PyErr_SetString, %rax
callq *%rax
.LBB1_1:
xorl %eax, %eax
jmp .LBB1_2
.Lfunc_end1:
.size _ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, .Lfunc_end1-_ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
.cfi_endproc
.globl cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
.p2align 4, 0x90
.type cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@function
cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE:
subq $88, %rsp
vmovaps 128(%rsp), %xmm0
movq 144(%rsp), %rax
leaq 80(%rsp), %rcx
movq $0, 80(%rsp)
movq %rax, 64(%rsp)
movabsq $_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, %rax
vmovups %xmm0, 48(%rsp)
callq *%rax
vmovsd 80(%rsp), %xmm0
addq $88, %rsp
retq
.Lfunc_end2:
.size cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, .Lfunc_end2-cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE
.weak NRT_decref
.p2align 4, 0x90
.type NRT_decref,@function
NRT_decref:
.cfi_startproc
testq %rcx, %rcx
je .LBB3_2
#MEMBARRIER
lock decq (%rcx)
je .LBB3_3
.LBB3_2:
retq
.LBB3_3:
movabsq $NRT_MemInfo_call_dtor, %rax
#MEMBARRIER
rex64 jmpq *%rax
.Lfunc_end3:
.size NRT_decref, .Lfunc_end3-NRT_decref
.cfi_endproc
.type .const.nb_max,@object
.section .rodata,"a",@progbits
.const.nb_max:
.asciz "nb_max"
.size .const.nb_max, 7
.type _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,@object
.comm _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE,8,8
.type ".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE",@object
.p2align 4
".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE":
.asciz "missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE"
.size ".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE", 128
.type ".const.can't unbox array from PyObject into native value. The object maybe of a different type",@object
.p2align 4
".const.can't unbox array from PyObject into native value. The object maybe of a different type":
.asciz "can't unbox array from PyObject into native value. The object maybe of a different type"
.size ".const.can't unbox array from PyObject into native value. The object maybe of a different type", 89
.section ".note.GNU-stack","",@progbits
inspect_llvm_10-6.txt:
; ModuleID = 'nb_max'
source_filename = "<string>"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
@.const.nb_max = internal constant [7 x i8] c"nb_max\00"
@_ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE = common local_unnamed_addr global i8* null
@".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE" = internal constant [128 x i8] c"missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE\00"
@PyExc_TypeError = external global i8
@".const.can't unbox array from PyObject into native value. The object maybe of a different type" = internal constant [89 x i8] c"can't unbox array from PyObject into native value. The object maybe of a different type\00"
@PyExc_RuntimeError = external global i8
; Function Attrs: nofree norecurse nosync nounwind
define i32 @_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(double* noalias nocapture writeonly %retptr, { i8*, i32, i8*, i8*, i32 }** noalias nocapture readnone %excinfo, i8* nocapture readnone %arg.arr.0, i8* nocapture readnone %arg.arr.1, i64 %arg.arr.2, i64 %arg.arr.3, double* %arg.arr.4, i64 %arg.arr.5.0, i64 %arg.arr.6.0) local_unnamed_addr #0 {
B0.else.endif:
%.44 = load double, double* %arg.arr.4, align 8
%.93 = icmp slt i64 %arg.arr.5.0, 2
br i1 %.93, label %B0.else.endif.if, label %B0.endif, !prof !0
B24: ; preds = %B24, %B24.lr.ph.new
%lsr.iv9 = phi i64 [ %8, %B24 ], [ %16, %B24.lr.ph.new ]
%a_max.2.04 = phi double [ %.44, %B24.lr.ph.new ], [ %.321.7, %B24 ]
%.224.03 = phi i64 [ 0, %B24.lr.ph.new ], [ %.294.7, %B24 ]
%.290 = inttoptr i64 %lsr.iv9 to double*
%.291 = load double, double* %.290, align 8
%.320 = fcmp ogt double %.291, %a_max.2.04
%.321 = select i1 %.320, double %.291, double %a_max.2.04
%0 = add i64 %arg.arr.6.0, %lsr.iv9
%.290.1 = inttoptr i64 %0 to double*
%.291.1 = load double, double* %.290.1, align 8
%.320.1 = fcmp ogt double %.291.1, %.321
%.321.1 = select i1 %.320.1, double %.291.1, double %.321
%sunkaddr = inttoptr i64 %lsr.iv9 to double*
%sunkaddr11 = mul i64 %arg.arr.6.0, 2
%1 = bitcast double* %sunkaddr to i8*
%sunkaddr12 = getelementptr i8, i8* %1, i64 %sunkaddr11
%2 = bitcast i8* %sunkaddr12 to double*
%.291.2 = load double, double* %2, align 8
%.320.2 = fcmp ogt double %.291.2, %.321.1
%.321.2 = select i1 %.320.2, double %.291.2, double %.321.1
%3 = add i64 %17, %lsr.iv9
%.290.3 = inttoptr i64 %3 to double*
%.291.3 = load double, double* %.290.3, align 8
%.320.3 = fcmp ogt double %.291.3, %.321.2
%.321.3 = select i1 %.320.3, double %.291.3, double %.321.2
%4 = add i64 %arg.arr.6.0, %3
%.290.4 = inttoptr i64 %4 to double*
%.291.4 = load double, double* %.290.4, align 8
%.320.4 = fcmp ogt double %.291.4, %.321.3
%.321.4 = select i1 %.320.4, double %.291.4, double %.321.3
%5 = add i64 %arg.arr.6.0, %4
%.290.5 = inttoptr i64 %5 to double*
%.291.5 = load double, double* %.290.5, align 8
%.320.5 = fcmp ogt double %.291.5, %.321.4
%.321.5 = select i1 %.320.5, double %.291.5, double %.321.4
%6 = add i64 %arg.arr.6.0, %5
%.290.6 = inttoptr i64 %6 to double*
%.291.6 = load double, double* %.290.6, align 8
%.320.6 = fcmp ogt double %.291.6, %.321.5
%.321.6 = select i1 %.320.6, double %.291.6, double %.321.5
%7 = add i64 %arg.arr.6.0, %6
%.290.7 = inttoptr i64 %7 to double*
%.291.7 = load double, double* %.290.7, align 8
%.294.7 = add nuw i64 %.224.03, 8
%.320.7 = fcmp ogt double %.291.7, %.321.6
%.321.7 = select i1 %.320.7, double %.291.7, double %.321.6
%niter.ncmp.7 = icmp eq i64 %unroll_iter, %.294.7
%8 = add i64 %arg.arr.6.0, %7
br i1 %niter.ncmp.7, label %B38.loopexit.unr-lcssa, label %B24
B38.loopexit.unr-lcssa: ; preds = %B24, %B24.lr.ph
%.321.lcssa.ph = phi double [ undef, %B24.lr.ph ], [ %.321.7, %B24 ]
%a_max.2.04.unr = phi double [ %.44, %B24.lr.ph ], [ %.321.7, %B24 ]
%.224.03.unr = phi i64 [ 0, %B24.lr.ph ], [ %.294.7, %B24 ]
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %B38, label %B24.epil.preheader
B24.epil.preheader: ; preds = %B38.loopexit.unr-lcssa
%9 = ptrtoint double* %arg.arr.4 to i64
%10 = mul i64 %.224.03.unr, %arg.arr.6.0
%11 = add i64 %9, %10
%12 = shl i64 %.73.sroa.0.0, 3
%13 = add i64 %11, %12
br label %B24.epil
B24.epil: ; preds = %B24.epil.preheader, %B24.epil
%lsr.iv7 = phi i64 [ %xtraiter, %B24.epil.preheader ], [ %lsr.iv.next8, %B24.epil ]
%lsr.iv = phi i64 [ %13, %B24.epil.preheader ], [ %lsr.iv.next, %B24.epil ]
%a_max.2.04.epil = phi double [ %.321.epil, %B24.epil ], [ %a_max.2.04.unr, %B24.epil.preheader ]
%.290.epil = inttoptr i64 %lsr.iv to double*
%.291.epil = load double, double* %.290.epil, align 8
%.320.epil = fcmp ogt double %.291.epil, %a_max.2.04.epil
%.321.epil = select i1 %.320.epil, double %.291.epil, double %a_max.2.04.epil
%lsr.iv.next = add i64 %lsr.iv, %arg.arr.6.0
%lsr.iv.next8 = add nsw i64 %lsr.iv7, -1
%epil.iter.cmp.not = icmp eq i64 %lsr.iv.next8, 0
br i1 %epil.iter.cmp.not, label %B38, label %B24.epil, !llvm.loop !1
B38: ; preds = %B24.epil, %B38.loopexit.unr-lcssa, %B0.endif
%a_max.2.0.lcssa = phi double [ %.44, %B0.endif ], [ %.321.lcssa.ph, %B38.loopexit.unr-lcssa ], [ %.321.epil, %B24.epil ]
store double %a_max.2.0.lcssa, double* %retptr, align 8
ret i32 0
B0.endif: ; preds = %B0.else.endif, %B0.else.endif.if
%.73.sroa.0.0 = phi i64 [ %arg.arr.5.0, %B0.else.endif.if ], [ 1, %B0.else.endif ]
%.161 = sub i64 %arg.arr.5.0, %.73.sroa.0.0
%.168.inv = icmp sgt i64 %.161, 0
%.170 = select i1 %.168.inv, i64 %.161, i64 0
br i1 %.168.inv, label %B24.lr.ph, label %B38
B24.lr.ph: ; preds = %B0.endif
%.186 = getelementptr double, double* %arg.arr.4, i64 %.73.sroa.0.0
%14 = add nsw i64 %.170, -1
%xtraiter = and i64 %.170, 7
%15 = icmp ult i64 %14, 7
br i1 %15, label %B38.loopexit.unr-lcssa, label %B24.lr.ph.new
B24.lr.ph.new: ; preds = %B24.lr.ph
%16 = ptrtoint double* %.186 to i64
%unroll_iter = and i64 %.170, 9223372036854775800
%17 = mul i64 %arg.arr.6.0, 3
br label %B24
B0.else.endif.if: ; preds = %B0.else.endif
br label %B0.endif
}
define i8* @_ZN7cpython8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(i8* nocapture readnone %py_closure, i8* %py_args, i8* nocapture readnone %py_kws) local_unnamed_addr {
entry:
%.5 = alloca i8*, align 8
%.6 = call i32 (i8*, i8*, i64, i64, ...) @PyArg_UnpackTuple(i8* %py_args, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.const.nb_max, i64 0, i64 0), i64 1, i64 1, i8** nonnull %.5)
%.7 = icmp eq i32 %.6, 0
%.21 = alloca { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, align 8
%.43 = alloca double, align 8
br i1 %.7, label %common.ret, label %entry.endif, !prof !0
common.ret: ; preds = %entry.endif.endif.endif.thread, %entry, %entry.endif.endif.endif.endif, %entry.endif.if
%common.ret.op = phi i8* [ null, %entry.endif.if ], [ %.67, %entry.endif.endif.endif.endif ], [ null, %entry ], [ null, %entry.endif.endif.endif.thread ]
ret i8* %common.ret.op
entry.endif: ; preds = %entry
%.11 = load i8*, i8** @_ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE, align 8
%.16 = icmp eq i8* %.11, null
br i1 %.16, label %entry.endif.if, label %entry.endif.endif, !prof !0
entry.endif.if: ; preds = %entry.endif
call void @PyErr_SetString(i8* nonnull @PyExc_RuntimeError, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @".const.missing Environment: _ZN08NumbaEnv8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE", i64 0, i64 0))
br label %common.ret
entry.endif.endif: ; preds = %entry.endif
%.20 = load i8*, i8** %.5, align 8
%.24 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
%0 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(56) %0, i8 0, i64 56, i1 false)
%.25 = call i32 @NRT_adapt_ndarray_from_python(i8* %.20, i8* nonnull %.24)
%1 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
%sunkaddr = getelementptr inbounds i8, i8* %1, i64 24
%2 = bitcast i8* %sunkaddr to i64*
%.29 = load i64, i64* %2, align 8
%.30 = icmp ne i64 %.29, 8
%.31 = icmp ne i32 %.25, 0
%.32 = or i1 %.31, %.30
br i1 %.32, label %entry.endif.endif.endif.thread, label %entry.endif.endif.endif.endif, !prof !0
entry.endif.endif.endif.thread: ; preds = %entry.endif.endif
call void @PyErr_SetString(i8* nonnull @PyExc_TypeError, i8* getelementptr inbounds ([89 x i8], [89 x i8]* @".const.can't unbox array from PyObject into native value. The object maybe of a different type", i64 0, i64 0))
br label %common.ret
entry.endif.endif.endif.endif: ; preds = %entry.endif.endif
%3 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8**
%.36.fca.0.load = load i8*, i8** %3, align 8
%4 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
%sunkaddr3 = getelementptr inbounds i8, i8* %4, i64 32
%5 = bitcast i8* %sunkaddr3 to double**
%.36.fca.4.load = load double*, double** %5, align 8
%6 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
%sunkaddr4 = getelementptr inbounds i8, i8* %6, i64 40
%7 = bitcast i8* %sunkaddr4 to i64*
%.36.fca.5.0.load = load i64, i64* %7, align 8
%8 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.21 to i8*
%sunkaddr5 = getelementptr inbounds i8, i8* %8, i64 48
%9 = bitcast i8* %sunkaddr5 to i64*
%.36.fca.6.0.load = load i64, i64* %9, align 8
store double 0.000000e+00, double* %.43, align 8
%.49 = call i32 @_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(double* nonnull %.43, { i8*, i32, i8*, i8*, i32 }** nonnull undef, i8* undef, i8* undef, i64 undef, i64 undef, double* %.36.fca.4.load, i64 %.36.fca.5.0.load, i64 %.36.fca.6.0.load) #1
%.59 = load double, double* %.43, align 8
call void @NRT_decref(i8* %.36.fca.0.load)
%.67 = call i8* @PyFloat_FromDouble(double %.59)
br label %common.ret
}
declare i32 @PyArg_UnpackTuple(i8*, i8*, i64, i64, ...) local_unnamed_addr
declare void @PyErr_SetString(i8*, i8*) local_unnamed_addr
declare i32 @NRT_adapt_ndarray_from_python(i8* nocapture, i8* nocapture) local_unnamed_addr
declare i8* @PyFloat_FromDouble(double) local_unnamed_addr
; Function Attrs: nofree norecurse nosync nounwind
define double @cfunc._ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE({ i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1) local_unnamed_addr #0 {
entry:
%.3 = alloca double, align 8
store double 0.000000e+00, double* %.3, align 8
%extracted.data = extractvalue { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1, 4
%extracted.shape = extractvalue { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1, 5
%.7 = extractvalue [1 x i64] %extracted.shape, 0
%extracted.strides = extractvalue { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.1, 6
%.8 = extractvalue [1 x i64] %extracted.strides, 0
%.9 = call i32 @_ZN8__main__6nb_maxB2v1B38c8tJTIcFKzyF2ILShI4CrgQElQb6HczSBAA_3dE5ArrayIdLi1E1C7mutable7alignedE(double* nonnull %.3, { i8*, i32, i8*, i8*, i32 }** nonnull undef, i8* undef, i8* undef, i64 undef, i64 undef, double* %extracted.data, i64 %.7, i64 %.8) #1
%.19 = load double, double* %.3, align 8
ret double %.19
}
; Function Attrs: noinline
define linkonce_odr void @NRT_decref(i8* %.1) local_unnamed_addr #1 {
.3:
%.4 = icmp eq i8* %.1, null
br i1 %.4, label %common.ret1, label %.3.endif, !prof !0
common.ret1: ; preds = %.3, %.3.endif
ret void
.3.endif: ; preds = %.3
fence release
%.8 = bitcast i8* %.1 to i64*
%.4.i = atomicrmw sub i64* %.8, i64 1 monotonic, align 8
%.10 = icmp eq i64 %.4.i, 1
br i1 %.10, label %.3.endif.if, label %common.ret1, !prof !0
.3.endif.if: ; preds = %.3.endif
fence acquire
tail call void @NRT_MemInfo_call_dtor(i8* nonnull %.1)
ret void
}
declare void @NRT_MemInfo_call_dtor(i8*) local_unnamed_addr
; Function Attrs: argmemonly nofree nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2
attributes #0 = { nofree norecurse nosync nounwind }
attributes #1 = { noinline }
attributes #2 = { argmemonly nofree nounwind willreturn writeonly }
!0 = !{!"branch_weights", i32 1, i32 99}
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.unroll.disable"}
型
1条答案
按热度按时间3ks5zfa01#
我可以通过手动展开循环来榨取额外的性能(但 `np.max` 仍然更快)。这完全取决于处理器,以及它如何处理缓存和从内存中预取数据:
在我的机器(AMD 5700x)上,对于 `float64` 类型的数组,得到了这张图: