我正在使用Algorand合约代码,它在汇编代码中的可能操作范围非常有限-例如,无法控制代码流。基本的64位算术操作可用。我需要做的是将某个128位整数的两个64位段(作为一个整体128位数)除以另一个64位整数,知道结果将适合64位整数。是否可能不控制代码流而仅使用64位算术?
u5i3ibmn1#
我不熟悉Algorand语言。当然可以创建无分支代码,用于将128位操作数除以64位操作数,始终使用64位操作。最简单的方法是使用我们在小学都学过的长手除法和乘法算法,但使用232作为基数,而不是10。一般来说,除了简单的算术运算外,这还需要移位和逻辑运算,我假设这在下面的ISO-C99代码中是可用的。人们可以使用现有的64位除法来生成下一个商“数字”的接近估计,通过将被除数的前导两个“数字”除以除数的前导“数字”,其中“数字”被理解为基数232。在TAOCP中,Knuth表明,如果除数被归一化,则误差为正且至多为2,即,它的最高有效位被设置。我们通过反乘和减法计算余数,然后可以检查它是否为负。如果是,我们将(部分)商降低1并重新计算余数。由于不允许分支,因此需要一种机制来从两个操作数中选择两个可能的有条件选择。这可以通过将 predicate 计算为一个取0或1的整数,并将其传递给一个2:1多路复用器函数来完成,该函数将一个源操作数与 predicate 相乘,另一个源操作数与 predicate 的补码相乘,并将结果相加。在下面的代码中,此函数称为mux_64()。我测试了函数my_div_128_64()中的除法代码,将其与我的64位Intel CPU的DIV指令进行比较,通过内联汇编访问该指令。代码是使用Intel编译器版本13构建的。我实现了两种测试模式,一个基于纯粹的随机操作数,另一个基于简单的模式,这些模式对于命中各种边界情况都很有用。没有发现不匹配到目前为止。更复杂的测试是可能的,但这对于一个像样的“烟雾”测试应该足够了。
mux_64()
my_div_128_64()
DIV
#include <stdio.h> #include <stdlib.h> #include <stdint.h> #define PATTERN_TEST (1) #define MSVC_STYLE (1) #define GCC_STYLE (2) #define ASM_STYLE (GCC_STYLE) // our own 128-bit unsigned integer type typedef struct { uint64_t x; uint64_t y; } ulonglong2; // (a < b) ? 1 : 0 uint64_t ltu_64 (uint64_t a, uint64_t b) { return ((~a & b) + ((~a ^ b) >> 1)) >> 63; } // (a != b) ? 1 : 0 uint64_t neq_64 (uint64_t a, uint64_t b) { return ((((a ^ b) | (1ULL << 63)) - 1ULL) | (a ^ b)) >> 63; } // (a >= b) ? 1 : 0 uint64_t geu_64 (uint64_t a, uint64_t b) { return ((a | ~b) - ((a ^ ~b) >> 1)) >> 63; } //#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0) #define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=ltu_64(t0,t1), t0=t0) #define ADDC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t0+t1) //#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=t1<t0, t1-t0) #define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=ltu_64(t1,t0), t1-t0) #define SUBC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t1-t0) // 128-bit subtraction ulonglong2 sub_128 (ulonglong2 a, ulonglong2 b) { ulonglong2 res; uint64_t cy, t0, t1; res.x = SUBcc (a.x, b.x, cy, t0, t1); res.y = SUBC (a.y, b.y, cy, t0, t1); return res; } // 128-bit addition ulonglong2 add_128 (ulonglong2 a, ulonglong2 b) { ulonglong2 res; uint64_t cy, t0, t1; res.x = ADDcc (a.x, b.x, cy, t0, t1); res.y = ADDC (a.y, b.y, cy, t0, t1); return res; } // branch free implementation of ((cond) ? a : b). cond must be in {0,1} uint64_t mux_64 (uint64_t cond, uint64_t a, int64_t b) { return cond * a + (1 - cond) * b; } // count leading zeros uint64_t clz_64 (uint64_t x) { uint64_t n = 64; uint64_t y; y = x >> 32; n = mux_64 (neq_64 (y, 0), n - 32, n); x = mux_64 (neq_64 (y, 0), y, x); y = x >> 16; n = mux_64 (neq_64 (y, 0), n - 16, n); x = mux_64 (neq_64 (y, 0), y, x); y = x >> 8; n = mux_64 (neq_64 (y, 0), n - 8, n); x = mux_64 (neq_64 (y, 0), y, x); y = x >> 4; n = mux_64 (neq_64 (y, 0), n - 4, n); x = mux_64 (neq_64 (y, 0), y, x); y = x >> 2; n = mux_64 (neq_64 (y, 0), n - 2, n); x = mux_64 (neq_64 (y, 0), y, x); y = x >> 1; n = mux_64 (neq_64 (y, 0), n - 2, n - x); return n; } // 64x64->128 bit unsigned multiplication ulonglong2 my_mul_64_128 (uint64_t a, uint64_t b) { ulonglong2 res; const uint64_t mask_lo32 = 0xffffffffULL; uint64_t a_lo = a & mask_lo32; uint64_t a_hi = a >> 32; uint64_t b_lo = b & mask_lo32; uint64_t b_hi = b >> 32; uint64_t p0 = a_lo * b_lo; uint64_t p1 = a_lo * b_hi; uint64_t p2 = a_hi * b_lo; uint64_t p3 = a_hi * b_hi; uint64_t cy = ((p0 >> 32) + (p1 & mask_lo32) + (p2 & mask_lo32)) >> 32; res.x = p0 + (p1 << 32) + (p2 << 32); res.y = p3 + (p1 >> 32) + (p2 >> 32) + cy; return res; } // 128/64->64 bit unsigned division uint64_t my_div_128_64 (ulonglong2 dvnd, uint64_t dvsr) { ulonglong2 prod, rem; uint64_t lz, q, qh, ql, rem_msb, nrm_dvsr, cy, t0, t1; // normalize divisor; adjust dividend accordingly (initial partial remainder) lz = clz_64 (dvsr); nrm_dvsr = dvsr << lz; rem.y = (dvnd.y << lz) | mux_64 (neq_64 (lz, 0), dvnd.x >> (64 - lz), 0); rem.x = dvnd.x << lz; // compute most significant quotient "digit"; TAOCP: may be off by 0, +1, +2 qh = mux_64 (geu_64 (rem.y >> 32, nrm_dvsr >> 32), 0xffffffff, rem.y / (nrm_dvsr >> 32)); // compute remainder; correct quotient "digit" if remainder negative prod = my_mul_64_128 (qh << 32, nrm_dvsr); rem = sub_128 (rem, prod); rem_msb= rem.y >> 63; qh = mux_64 (rem_msb, qh - 1, qh); rem.x = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr << 32, cy, t0, t1), rem.x); rem.y = mux_64 (rem_msb, ADDC (rem.y, nrm_dvsr >> 32, cy, t0, t1), rem.y); rem_msb= rem.y >> 63; qh = mux_64 (rem_msb, qh - 1, qh); rem.x = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr << 32, cy, t0, t1), rem.x); rem.y = mux_64 (rem_msb, ADDC (rem.y, nrm_dvsr >> 32, cy, t0, t1), rem.y); // pull down next dividend "digit" rem.y = (rem.y << 32) | (rem.x >> 32); rem.x = rem.x << 32; // compute least significant quotient "digit"; TAOCP: may be off by 0, +1, +2 ql = mux_64 (geu_64 (rem.y >> 32, nrm_dvsr >> 32), 0xffffffff, rem.y / (nrm_dvsr >> 32)); // combine partial results into complete quotient q = (qh << 32) + ql; // compute remainder; correct quotient "digit" if remainder negative prod = my_mul_64_128 (q, dvsr); rem = sub_128 (dvnd, prod); rem_msb= rem.y >> 63; q = mux_64 (rem_msb, q - 1, q); rem.x = mux_64 (rem_msb, ADDcc (rem.x, dvsr, cy, t0, t1), rem.x); rem.y = mux_64 (rem_msb, ADDC (rem.y, 0, cy, t0, t1), rem.y); rem_msb= rem.y >> 63; q = mux_64 (rem_msb, q - 1, q); return q; } uint64_t ref_div_128_64 (ulonglong2 dvnd, uint64_t dvsr) { uint64_t quot; #if ASM_STYLE == MSVC_STYLE __asm mov rax, qword ptr [dvnd.x]; __asm mov rdx, qword ptr [dvnd.y]; __asm div qword ptr [dvsr]; __asm mov qword ptr [quot], rax; #else // ASM_STYLE __asm__ ( "movq %1, %%rax\n\t" "movq %2, %%rdx\n\t" "divq %3\n\t" "movq %%rax, %0\n\t" : "=r" (quot) : "r" (dvnd.x), "r" (dvnd.y), "r" (dvsr) : "rax", "rdx"); #endif // ASM_STYLE return quot; } ulonglong2 ref_mul_64_128 (uint64_t a, uint64_t b) { ulonglong2 prod; #if ASM_STYLE == MSVC_STYLE __asm mov rax, qword ptr [a]; __asm mul qword ptr [b]; __asm mov qword ptr [prod.x], rax; __asm mov qword ptr [prod.y], rdx; #else // ASM_STYLE __asm__( "movq %2, %%rax\n\t" "mulq %3\n\t" "movq %%rax, %0\n\t" "movq %%rdx, %1\n\t" : "=r" (prod.x), "=r" (prod.y) : "r" (a), "r" (b) : "rax", "rdx"); #endif // ASM_STYLE return prod; } /* https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J From: geo <[email protected]> Newsgroups: sci.math,comp.lang.c,comp.lang.fortran Subject: 64-bit KISS RNGs Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST) This 64-bit KISS RNG has three components, each nearly good enough to serve alone. The components are: Multiply-With-Carry (MWC), period (2^121+2^63-1) Xorshift (XSH), period 2^64-1 Congruential (CNG), period 2^64 */ static uint64_t kiss64_x = 1234567890987654321ULL; static uint64_t kiss64_c = 123456123456123456ULL; static uint64_t kiss64_y = 362436362436362436ULL; static uint64_t kiss64_z = 1066149217761810ULL; static uint64_t kiss64_t; #define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \ kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \ kiss64_c += (kiss64_x < kiss64_t), kiss64_x) #define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \ kiss64_y ^= (kiss64_y << 43)) #define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL) #define KISS64 (MWC64 + XSH64 + CNG64) #define PATTERN (1) int main (void) { ulonglong2 dvnd, prod, diff; uint64_t dvsr, ref, res, count = 0; do { count++; do { #if PATTERN_TEST ulonglong2 temp1, temp2; uint64_t shift, shift1, shift2; shift = KISS64 & 127; temp1.y = (shift > 63) ? (1ULL << (shift - 64)) : 0; temp1.x = (shift > 63) ? 0ULL : (1ULL << shift); shift = KISS64 & 127; temp2.y = (shift > 63) ? (1ULL << (shift - 64)) : 0; temp2.x = (shift > 63) ? 0ULL : (1ULL << shift); dvnd = (KISS64 & 1) ? add_128 (temp1, temp2) : sub_128 (temp1, temp2); shift1 = KISS64 & 63; shift2 = KISS64 & 63; dvsr = (KISS64 & 1) ? ((1ULL << shift1) + (1ULL << shift2)) : ((1ULL << shift1) - (1ULL << shift2)); #else // PATTERN_TEST dvnd.y = KISS64; dvnd.x = KISS64; dvsr = KISS64; #endif // PATTERN_TEST } while ((dvsr == 0ULL) || (dvnd.y >= dvsr)); // guard against overflow ref = ref_div_128_64 (dvnd, dvsr); res = my_div_128_64 (dvnd, dvsr); prod = ref_mul_64_128 (res, dvsr); diff = sub_128 (dvnd, prod); if ((diff.y != 0) || (diff.x >= dvsr)) { printf ("error @ chk1: dvnd= %016llx_%016llx dvsr=%016llx res=%016llx ref=%016llx diff=%016llx_%016llx\n", dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x); return EXIT_FAILURE; } if (res != ref) { printf ("error @ chk2: dvnd= %016llx_%016llx dvsr=%016llx res=%016llx ref=%016llx diff=%016llx_%016llx\n", dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x); return EXIT_FAILURE; } if ((count & 0xffffff) == 0) printf ("\r%llu", count); } while (dvsr); return EXIT_SUCCESS; }
字符串如果编程语言受到更严格的限制,所有的移位和所有的逻辑运算,除了那些在 predicate 函数neq_64(),ltu_64()和geu_64()中使用的,都可以用算术来代替。示例ISO-C99代码如下所示。可能有编程语言特定的方法来实现这些 predicate 而不使用逻辑运算。
neq_64
ltu_64()
geu_64()
#include <stdio.h> #include <stdlib.h> #include <stdint.h> #define PATTERN_TEST (0) #define MSVC_STYLE (1) #define GCC_STYLE (2) #define ASM_STYLE (GCC_STYLE) #define POW2_63 (0x8000000000000000ULL) #define POW2_32 (0x0000000100000000ULL) #define POW2_16 (0x0000000000010000ULL) #define POW2_8 (0x0000000000000100ULL) #define POW2_5 (0x0000000000000020ULL) #define POW2_4 (0x0000000000000010ULL) #define POW2_3 (0x0000000000000008ULL) #define POW2_2 (0x0000000000000004ULL) #define POW2_1 (0x0000000000000002ULL) #define POW2_0 (0x0000000000000001ULL) // our own 128-bit unsigned integer type typedef struct { uint64_t x; uint64_t y; } ulonglong2; // (a < b) ? 1 : 0 uint64_t ltu_64 (uint64_t a, uint64_t b) { return ((~a & b) + ((~a ^ b) / POW2_1)) / POW2_63; } // (a >= b) ? 1 : 0 uint64_t geu_64 (uint64_t a, uint64_t b) { return ((a | ~b) - ((a ^ ~b) / POW2_1)) / POW2_63; } // (a != b) ? 1 : 0 uint64_t neq_64 (uint64_t a, uint64_t b) { return ((a - b) | (b - a)) / POW2_63; } //#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0) #define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=ltu_64(t0,t1), t0=t0) #define ADDC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t0+t1) //#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=t1<t0, t1-t0) #define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=ltu_64(t1,t0), t1-t0) #define SUBC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t1-t0) // 128-bit subtraction ulonglong2 sub_128 (ulonglong2 a, ulonglong2 b) { ulonglong2 res; uint64_t cy, t0, t1; res.x = SUBcc (a.x, b.x, cy, t0, t1); res.y = SUBC (a.y, b.y, cy, t0, t1); return res; } // 128-bit addition ulonglong2 add_128 (ulonglong2 a, ulonglong2 b) { ulonglong2 res; uint64_t cy, t0, t1; res.x = ADDcc (a.x, b.x, cy, t0, t1); res.y = ADDC (a.y, b.y, cy, t0, t1); return res; } // branch free implementation of ((cond) ? a : b). cond must be in {0,1} uint64_t mux_64 (uint64_t cond, uint64_t a, int64_t b) { return cond * a + (1 - cond) * b; } // count leading zeros uint64_t clz_64 (uint64_t x) { uint64_t n = 64; uint64_t c, y; y = x / POW2_32; c = neq_64 (y, 0); n = mux_64 (c, n - 32, n); x = mux_64 (c, y, x); y = x / POW2_16; c = neq_64 (y, 0); n = mux_64 (c, n - 16, n); x = mux_64 (c, y, x); y = x / POW2_8; c = neq_64 (y, 0); n = mux_64 (c, n - 8, n); x = mux_64 (c, y, x); y = x / POW2_4; c = neq_64 (y, 0); n = mux_64 (c, n - 4, n); x = mux_64 (c, y, x); y = x / POW2_2; c = neq_64 (y, 0); n = mux_64 (c, n - 2, n); x = mux_64 (c, y, x); y = x / POW2_1; c = neq_64 (y, 0); n = mux_64 (c, n - 2, n - x); return n; } // compute 2**i, 0 <= i <=63 uint64_t pow2i (uint64_t i) { uint64_t r = 1ULL; r = mux_64 ((i / POW2_5) % 2, r * POW2_32, r); r = mux_64 ((i / POW2_4) % 2, r * POW2_16, r); r = mux_64 ((i / POW2_3) % 2, r * POW2_8, r); r = mux_64 ((i / POW2_2) % 2, r * POW2_4, r); r = mux_64 ((i / POW2_1) % 2, r * POW2_2, r); r = mux_64 ((i / POW2_0) % 2, r * POW2_1, r); return r; } // 64x64->128 bit unsigned multiplication ulonglong2 my_mul_64_128 (uint64_t a, uint64_t b) { ulonglong2 res; uint64_t a_hi = a / POW2_32; uint64_t a_lo = a % POW2_32; uint64_t b_hi = b / POW2_32; uint64_t b_lo = b % POW2_32; uint64_t p0 = a_lo * b_lo; uint64_t p1 = a_lo * b_hi; uint64_t p2 = a_hi * b_lo; uint64_t p3 = a_hi * b_hi; uint64_t cy = ((p0 / POW2_32) + (p1 % POW2_32) + (p2 % POW2_32)) / POW2_32; res.x = p0 + (p1 * POW2_32) + (p2 * POW2_32); res.y = p3 + (p1 / POW2_32) + (p2 / POW2_32) + cy; return res; } // 128/64->64 bit unsigned division uint64_t my_div_128_64 (ulonglong2 dvnd, uint64_t dvsr) { ulonglong2 prod, rem; uint64_t lz, lz_nz, scal, shft, q, qh, ql, rem_msb, nrm_dvsr, nrm_dvsr_hi, cy, t0, t1; // normalize divisor; adjust dividend accordingly (initial partial remainder) lz = clz_64 (dvsr); lz_nz = neq_64 (lz, 0); scal = pow2i (lz); shft = mux_64 (lz_nz, 64 - lz, 0); nrm_dvsr = dvsr * scal; rem.y = dvnd.y * scal + mux_64 (lz_nz, dvnd.x / pow2i (shft), 0); rem.x = dvnd.x * scal; nrm_dvsr_hi = nrm_dvsr / POW2_32; // compute most significant quotient "digit"; TAOCP: may be off by 0, +1, +2 qh = mux_64 (geu_64 (rem.y / POW2_32, nrm_dvsr_hi), 0xffffffff, rem.y / (nrm_dvsr_hi)); // compute remainder; correct quotient "digit" if remainder negative prod = my_mul_64_128 (qh * POW2_32, nrm_dvsr); rem = sub_128 (rem, prod); rem_msb= rem.y / POW2_63; qh = mux_64 (rem_msb, qh - 1, qh); rem.x = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr * POW2_32, cy, t0, t1), rem.x); rem.y = mux_64 (rem_msb, ADDC (rem.y, nrm_dvsr / POW2_32, cy, t0, t1), rem.y); rem_msb= rem.y / POW2_63; qh = mux_64 (rem_msb, qh - 1, qh); rem.x = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr * POW2_32, cy, t0, t1), rem.x); rem.y = mux_64 (rem_msb, ADDC (rem.y, nrm_dvsr / POW2_32, cy, t0, t1), rem.y); // pull down next dividend "digit" rem.y = rem.y * POW2_32 + rem.x / POW2_32; rem.x = rem.x * POW2_32; // compute least significant quotient "digit"; TAOCP: may be off by 0, +1, +2 ql = mux_64 (geu_64 (rem.y / POW2_32, nrm_dvsr_hi), 0xffffffff, rem.y / (nrm_dvsr_hi)); // combine partial results into complete quotient q = qh * POW2_32 + ql; // compute remainder; correct quotient "digit" if remainder negative prod = my_mul_64_128 (q, dvsr); rem = sub_128 (dvnd, prod); rem_msb= rem.y / POW2_63; q = mux_64 (rem_msb, q - 1, q); rem.x = mux_64 (rem_msb, ADDcc (rem.x, dvsr, cy, t0, t1), rem.x); rem.y = mux_64 (rem_msb, ADDC (rem.y, 0, cy, t0, t1), rem.y); rem_msb= rem.y / POW2_63; q = mux_64 (rem_msb, q - 1, q); return q; } uint64_t ref_div_128_64 (ulonglong2 dvnd, uint64_t dvsr) { uint64_t quot; #if ASM_STYLE == MSVC_STYLE __asm mov rax, qword ptr [dvnd.x]; __asm mov rdx, qword ptr [dvnd.y]; __asm div qword ptr [dvsr]; __asm mov qword ptr [quot], rax; #else // ASM_STYLE __asm__ ( "movq %1, %%rax\n\t" "movq %2, %%rdx\n\t" "divq %3\n\t" "movq %%rax, %0\n\t" : "=r" (quot) : "r" (dvnd.x), "r" (dvnd.y), "r" (dvsr) : "rax", "rdx"); #endif // ASM_STYLE return quot; } ulonglong2 ref_mul_64_128 (uint64_t a, uint64_t b) { ulonglong2 prod; #if ASM_STYLE == MSVC_STYLE __asm mov rax, qword ptr [a]; __asm mul qword ptr [b]; __asm mov qword ptr [prod.x], rax; __asm mov qword ptr [prod.y], rdx; #else // ASM_STYLE __asm__( "movq %2, %%rax\n\t" "mulq %3\n\t" "movq %%rax, %0\n\t" "movq %%rdx, %1\n\t" : "=r" (prod.x), "=r" (prod.y) : "r" (a), "r" (b) : "rax", "rdx"); #endif // ASM_STYLE return prod; } /* https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J From: geo <[email protected]> Newsgroups: sci.math,comp.lang.c,comp.lang.fortran Subject: 64-bit KISS RNGs Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST) This 64-bit KISS RNG has three components, each nearly good enough to serve alone. The components are: Multiply-With-Carry (MWC), period (2^121+2^63-1) Xorshift (XSH), period 2^64-1 Congruential (CNG), period 2^64 */ static uint64_t kiss64_x = 1234567890987654321ULL; static uint64_t kiss64_c = 123456123456123456ULL; static uint64_t kiss64_y = 362436362436362436ULL; static uint64_t kiss64_z = 1066149217761810ULL; static uint64_t kiss64_t; #define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \ kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \ kiss64_c += (kiss64_x < kiss64_t), kiss64_x) #define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \ kiss64_y ^= (kiss64_y << 43)) #define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL) #define KISS64 (MWC64 + XSH64 + CNG64) #define PATTERN (1) int main (void) { ulonglong2 dvnd, prod, diff; uint64_t dvsr, ref, res, count = 0; do { count++; do { #if PATTERN_TEST ulonglong2 temp1, temp2; uint64_t shift, shift1, shift2; shift = KISS64 & 127; temp1.y = (shift > 63) ? (1ULL << (shift - 64)) : 0; temp1.x = (shift > 63) ? 0ULL : (1ULL << shift); shift = KISS64 & 127; temp2.y = (shift > 63) ? (1ULL << (shift - 64)) : 0; temp2.x = (shift > 63) ? 0ULL : (1ULL << shift); dvnd = (KISS64 & 1) ? add_128 (temp1, temp2) : sub_128 (temp1, temp2); shift1 = KISS64 & 63; shift2 = KISS64 & 63; dvsr = (KISS64 & 1) ? ((1ULL << shift1) + (1ULL << shift2)) : ((1ULL << shift1) - (1ULL << shift2)); #else // PATTERN_TEST dvnd.y = KISS64; dvnd.x = KISS64; dvsr = KISS64; #endif // PATTERN_TEST } while ((dvsr == 0ULL) || (dvnd.y >= dvsr)); // guard against overflow ref = ref_div_128_64 (dvnd, dvsr); res = my_div_128_64 (dvnd, dvsr); prod = ref_mul_64_128 (res, dvsr); diff = sub_128 (dvnd, prod); if ((diff.y != 0) || (diff.x >= dvsr)) { printf ("error @ chk1: dvnd= %016llx_%016llx dvsr=%016llx res=%016llx ref=%016llx diff=%016llx_%016llx\n", dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x); return EXIT_FAILURE; } if (res != ref) { printf ("error @ chk2: dvnd= %016llx_%016llx dvsr=%016llx res=%016llx ref=%016llx diff=%016llx_%016llx\n", dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x); return EXIT_FAILURE; } if ((count & 0xffffff) == 0) printf ("\r%llu", count); } while (dvsr); return EXIT_SUCCESS; }
型
oxalkeyp2#
不是回答主要问题,而是解决根本问题。Algorand智能合约现在直接使用操作码divmodw支持这样的大除法:https://developer.algorand.org/docs/get-details/dapps/avm/teal/opcodes/#divmodw它们还支持最多512位数字的除法:https://developer.algorand.org/docs/get-details/dapps/avm/teal/specification/#arithmetic-logic-and-cryptographic-operations
divmodw
2条答案
按热度按时间u5i3ibmn1#
我不熟悉Algorand语言。当然可以创建无分支代码,用于将128位操作数除以64位操作数,始终使用64位操作。最简单的方法是使用我们在小学都学过的长手除法和乘法算法,但使用232作为基数,而不是10。一般来说,除了简单的算术运算外,这还需要移位和逻辑运算,我假设这在下面的ISO-C99代码中是可用的。
人们可以使用现有的64位除法来生成下一个商“数字”的接近估计,通过将被除数的前导两个“数字”除以除数的前导“数字”,其中“数字”被理解为基数232。在TAOCP中,Knuth表明,如果除数被归一化,则误差为正且至多为2,即,它的最高有效位被设置。我们通过反乘和减法计算余数,然后可以检查它是否为负。如果是,我们将(部分)商降低1并重新计算余数。
由于不允许分支,因此需要一种机制来从两个操作数中选择两个可能的有条件选择。这可以通过将 predicate 计算为一个取0或1的整数,并将其传递给一个2:1多路复用器函数来完成,该函数将一个源操作数与 predicate 相乘,另一个源操作数与 predicate 的补码相乘,并将结果相加。在下面的代码中,此函数称为
mux_64()
。我测试了函数
my_div_128_64()
中的除法代码,将其与我的64位Intel CPU的DIV
指令进行比较,通过内联汇编访问该指令。代码是使用Intel编译器版本13构建的。我实现了两种测试模式,一个基于纯粹的随机操作数,另一个基于简单的模式,这些模式对于命中各种边界情况都很有用。没有发现不匹配到目前为止。更复杂的测试是可能的,但这对于一个像样的“烟雾”测试应该足够了。字符串
如果编程语言受到更严格的限制,所有的移位和所有的逻辑运算,除了那些在 predicate 函数
neq_64
(),ltu_64()
和geu_64()
中使用的,都可以用算术来代替。示例ISO-C99代码如下所示。可能有编程语言特定的方法来实现这些 predicate 而不使用逻辑运算。型
oxalkeyp2#
不是回答主要问题,而是解决根本问题。
Algorand智能合约现在直接使用操作码
divmodw
支持这样的大除法:https://developer.algorand.org/docs/get-details/dapps/avm/teal/opcodes/#divmodw它们还支持最多512位数字的除法:https://developer.algorand.org/docs/get-details/dapps/avm/teal/specification/#arithmetic-logic-and-cryptographic-operations