จะหารตัวตั้ง (dividend) ขนาด 128 บิตด้วยตัวหาร (divisor) ขนาด 64 บิตได้อย่างไร ในกรณีที่บิตของตัวตั้งเป็น 1 ทั้งหมด และต้องการเพียง 64 บิตล่าง (LSB) ของผลหาร (quotient)?

Question 1

ฉันต้องคำนวณ ((2^128) - 1) / x โดยตัวหาร x เป็นจำนวนเต็มไม่มีเครื่องหมายขนาด 64 บิต ส่วนตัวตั้งประกอบด้วยจำนวนเต็มไม่มีเครื่องหมายขนาด 64 บิตสองตัว (ส่วนสูงและส่วนต่ำ) ซึ่งทั้งสองตัวมีค่าเท่ากับ UINT64_MAX ฉันใช้ได้เฉพาะเลขคณิต 64 บิต และต้องการให้โค้ดพกพาได้ (portable) คือไม่ใช้ __int128 ของ GNU, _udiv128 ของ MSVC, ภาษาแอสเซมบลี หรืออะไรทำนองนั้น ฉันไม่ต้องการส่วนสูงของผลหาร ต้องการเพียง 64 บิตล่างเท่านั้น

ฉันจะทำการคำนวณนี้ได้อย่างไร?

นอกจากนี้: x >= 3 และ x ไม่ใช่กำลังของ 2

แก้ไข: ฉันเขียนวิธีแก้ของตัวเองแล้ว (ดูคำตอบด้านล่าง) แต่ยินดีรับวิธีอื่นที่ทำงานได้เร็วกว่า :)

Question 2

ฉันไม่ทราบว่ามีการปรับแต่งประสิทธิภาพ (optimization) ใดที่ใช้ได้กับการหารจำนวนเต็มที่ตัวตั้งเป็นค่าคงที่ เพื่อตรวจสอบให้แน่ใจ ฉันลองกรณีทดสอบที่ตัวตั้งมีบิตเป็น 1 ทั้งหมดบน Compiler Explorer เมื่อใช้ gcc, icc และ clang โดยกำหนดระดับ optimization สูงสุด โค้ดที่สร้างขึ้นแสดงให้เห็นว่าไม่มีการปรับแต่งใด ๆ กับการหาร

แน่นอนว่าสามารถสร้างรูทีนการหาร 128 บิตที่มีประสิทธิภาพสูงได้ แต่จากประสบการณ์ส่วนตัว ฉันทราบว่างานนี้ผิดพลาดได้ง่ายและทดสอบได้ค่อนข้างยาก เพราะต้องการความครอบคลุมของการทดสอบ (test coverage) ที่ดี รวมถึงกรณีขอบ (corner case) ในขณะที่การทดสอบแบบครบทุกกรณี (exhaustive test) เป็นไปไม่ได้สำหรับตัวถูกดำเนินการขนาดนี้ ความพยายามในการออกแบบและทดสอบจึงเกินกว่าที่ดูสมเหตุสมผลสำหรับคำตอบบน Stackoverflow ไปได้ง่าย ๆ ถึงสองอันดับขนาด (order of magnitude) ในฐานสิบ

วิธีง่าย ๆ ในการหารจำนวนเต็มคือใช้อัลกอริทึมหารยาวที่เราทุกคนเรียนมาตั้งแต่ชั้นประถม เพียงแต่ทำในฐานสอง ซึ่งทำให้การตัดสินค่าบิตถัดไปของผลหารง่ายเป็นพิเศษ: บิตนั้นเป็น 1 เมื่อเศษบางส่วน (partial remainder) ปัจจุบันมากกว่าหรือเท่ากับตัวหาร และเป็น 0 ในกรณีอื่น เมื่อใช้การหารยาวในฐานสอง การดำเนินการกับจำนวนเต็มที่ต้องใช้มีเพียงการบวกและการลบเท่านั้น

เราสามารถสร้าง primitive ที่พกพาได้สำหรับดำเนินการเหล่านี้กับตัวถูกดำเนินการที่มีความยาวบิตเท่าใดก็ได้ โดยเลียนแบบวิธีที่คำสั่งเครื่องของหน่วยประมวลผลใช้ดำเนินการกับจำนวนเต็มหลายคำ (multi-word integer): บวกพร้อมตัวทดออก (carry-out), บวกพร้อมตัวทดเข้า (carry-in), และบวกพร้อมทั้งตัวทดเข้าและตัวทดออก ส่วนการลบก็ทำนองเดียวกัน ในโค้ดด้านล่างฉันใช้มาโคร C อย่างง่ายสำหรับงานนี้ แน่นอนว่ายังมีวิธีที่ซับซ้อนกว่านี้ได้อีก

เนื่องจากระบบที่ฉันใช้งานอยู่ในขณะนี้ไม่รองรับจำนวนเต็ม 128 บิต ฉันจึงสร้างต้นแบบและทดสอบแนวทางนี้กับจำนวนเต็ม 64 บิต จากนั้นรุ่น 128 บิตก็เป็นเพียงการเปลี่ยนชื่อแบบกลไกอย่างง่าย บนหน่วยประมวลผล 64 บิตสมัยใหม่ ฉันคาดว่าฟังก์ชันหาร 128 บิตนี้จะใช้เวลาทำงานราว 3000 รอบสัญญาณนาฬิกา

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>

/*
 * Add/subtract primitives for multi-word integer arithmetic.  Each macro is a
 * comma expression whose value is the result word.  `cy` is the carry (or
 * borrow) flag; t0/t1/t2 are caller-supplied scratch variables that are
 * overwritten.  The carry detection relies on wrap-around comparisons, so the
 * operands are assumed to be unsigned.
 */

/* a - b - cy (borrow in); leaves the borrow out in cy */
#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

/* a - b; leaves the borrow out in cy (cy is not read) */
#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

/* a - b - cy (borrow in); cy is not modified */
#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

/* a + b + cy (carry in); leaves the carry out in cy */
#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

/* a + b; leaves the carry out in cy (cy is not read) */
#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

/* a + b + cy (carry in); cy is not modified */
#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

/* 128-bit unsigned integer represented as two 64-bit words. */
typedef struct {
    uint64_t l;  /* low-order 64 bits */
    uint64_t h;  /* high-order 64 bits */
} my_uint128;

/*
 * Unsigned 128-bit division by binary long division (shift and subtract).
 * Returns the quotient dvnd / dvsr; the remainder is tracked internally and
 * discarded.
 */
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 q = dvnd;  /* dividend bits leave at the top, quotient bits enter at the bottom */
    my_uint128 r;         /* running partial remainder */
    my_uint128 diff;
    uint64_t cy, t0, t1, t2;
    int n;

    r.l = 0;
    r.h = 0;
    for (n = CHAR_BIT * sizeof (my_uint128); n != 0; n--) {
        /* shift the double-width value r:q left by one bit */
        q.l = ADDcc  (q.l, q.l, cy, t0, t1);
        q.h = ADDCcc (q.h, q.h, cy, t0, t1);
        r.l = ADDCcc (r.l, r.l, cy, t0, t1);
        r.h = ADDC   (r.h, r.h, cy, t0, t1);
        /* trial subtraction; a borrow means the remainder is below the divisor */
        diff.l = SUBcc  (r.l, dvsr.l, cy, t0, t1);
        diff.h = SUBCcc (r.h, dvsr.h, cy, t0, t1, t2);
        if (cy == 0) {
            r = diff;
            q.l |= 1;
        }
    }
    return q;
}

/* 64-bit unsigned integer represented as two 32-bit words. */
typedef struct {
    uint32_t l;  /* low-order 32 bits */
    uint32_t h;  /* high-order 32 bits */
} my_uint64;

/*
 * Unsigned 64-bit division by binary long division (shift and subtract),
 * operating on two 32-bit halves.  Returns the quotient dvnd / dvsr; the
 * remainder is tracked internally and discarded.
 */
my_uint64 bitwise_division_64 (my_uint64 dvnd, my_uint64 dvsr)
{
    my_uint64 q = dvnd;  /* dividend bits leave at the top, quotient bits enter at the bottom */
    my_uint64 r;         /* running partial remainder */
    my_uint64 diff;
    uint32_t cy, t0, t1, t2;
    int n;

    r.l = 0;
    r.h = 0;
    for (n = CHAR_BIT * sizeof (my_uint64); n != 0; n--) {
        /* shift the double-width value r:q left by one bit */
        q.l = ADDcc  (q.l, q.l, cy, t0, t1);
        q.h = ADDCcc (q.h, q.h, cy, t0, t1);
        r.l = ADDCcc (r.l, r.l, cy, t0, t1);
        r.h = ADDC   (r.h, r.h, cy, t0, t1);
        /* trial subtraction; a borrow means the remainder is below the divisor */
        diff.l = SUBcc  (r.l, dvsr.l, cy, t0, t1);
        diff.h = SUBCcc (r.h, dvsr.h, cy, t0, t1, t2);
        if (cy == 0) {
            r = diff;
            q.l |= 1;
        }
    }
    return q;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
/* generator state with fixed seeds; kiss64_t is scratch storage for MWC64 */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
/* each component macro advances its own state and evaluates to the new value */
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* combined generator: sum of the three components; every use advances all of them */
#define KISS64 (MWC64 + XSH64 + CNG64)

/*
 * Random test driver: compares bitwise_division_64() against the native
 * 64-bit division for pseudo-random operands.  Returns EXIT_FAILURE on the
 * first mismatch; the loop ends only when the generator produces a == 0.
 */
int main (void)
{
    uint64_t a, b, res, ref;
    my_uint64 aa, bb, rr;
    do {
        a = KISS64;
        b = KISS64;
        if (b == 0) {
            continue; /* a / b is undefined for a zero divisor; draw new operands */
        }
        ref = a / b;

        /* split the operands into 32-bit halves for the emulated division */
        aa.l = (uint32_t)a;
        aa.h = (uint32_t)(a >> 32);
        bb.l = (uint32_t)b;
        bb.h = (uint32_t)(b >> 32);
        rr = bitwise_division_64 (aa, bb);
        res = (((uint64_t)rr.h) << 32) + rr.l;

        if (ref != res) {
            /* %llx requires unsigned long long; uint64_t may be a different type */
            printf ("a=%016llx b=%016llx res=%016llx ref=%016llx\n",
                    (unsigned long long)a, (unsigned long long)b,
                    (unsigned long long)res, (unsigned long long)ref);
            return EXIT_FAILURE;
        }
    } while (a);
    return EXIT_SUCCESS;
}

แนวทางที่เร็วกว่าการคำนวณทีละบิตคือ คำนวณส่วนกลับ (reciprocal) ของตัวหาร แล้วคูณด้วยตัวตั้งเพื่อให้ได้ผลหารเบื้องต้น จากนั้นคำนวณเศษเพื่อปรับผลหารให้ถูกต้อง การคำนวณทั้งหมดทำได้ด้วยเลขคณิตจุดตรึง (fixed-point) อย่างไรก็ตาม บนหน่วยประมวลผลสมัยใหม่ที่มีหน่วยคำนวณจุดลอยตัว (floating-point) ที่รวดเร็ว จะสะดวกกว่าหากสร้างค่าประมาณเริ่มต้นของส่วนกลับด้วยการหารแบบ double-precision จากนั้นการวนซ้ำแบบ Halley เพียงครั้งเดียวซึ่งลู่เข้าแบบกำลังสาม (cubic convergence) ก็จะให้ส่วนกลับที่มีความแม่นยำเต็มที่

การวนซ้ำแบบ Halley สำหรับส่วนกลับนั้นใช้การคูณจำนวนเต็มอย่างหนัก โดยการคูณ 64x64 บิตที่ให้ผลลัพธ์ 128 บิต (umul64wide() ในโค้ดด้านล่าง) เป็นส่วนประกอบพื้นฐานที่สำคัญต่อประสิทธิภาพ บนสถาปัตยกรรม 64 บิตสมัยใหม่ การคูณนี้มักเป็นคำสั่งเครื่องเพียงคำสั่งเดียวที่ทำงานเสร็จในไม่กี่รอบ แต่โค้ดที่พกพาได้ไม่สามารถเข้าถึงคำสั่งนี้ได้ โค้ดที่พกพาได้ซึ่งจำลองคำสั่งดังกล่าวต้องใช้ประมาณ 15 ถึง 20 คำสั่ง ขึ้นอยู่กับสถาปัตยกรรมและคอมไพเลอร์

การหาร 128 บิตทั้งหมดน่าจะใช้เวลาประมาณ 300 รอบ หรือเร็วกว่าการคำนวณทีละบิตอย่างง่ายราวสิบเท่า เนื่องจากโค้ดค่อนข้างซับซ้อน จึงต้องทดสอบเป็นจำนวนมากเพื่อให้แน่ใจว่าทำงานถูกต้อง ในเฟรมเวิร์กด้านล่าง ฉันใช้การทดสอบตามรูปแบบ (pattern-based) และการทดสอบแบบสุ่มเพื่อทดสอบอย่างเข้มข้นปานกลาง โดยใช้ implementation แบบทีละบิตที่ตรงไปตรงมาเป็นตัวอ้างอิง

implementation ของ udiv128() ด้านล่างตั้งสมมติฐานว่าสภาพแวดล้อมการเขียนโปรแกรม (environment) ใช้เลขคณิตจุดลอยตัวที่สอดคล้องกับ IEEE-754, ชนิด double ถูกแมปเป็นชนิด binary64 ของ IEEE-754 และการหารตัวถูกดำเนินการชนิด double ได้ผลที่ปัดเศษอย่างถูกต้อง (correctly rounded)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>

/* 128-bit unsigned integer represented as two 64-bit words. */
typedef struct {
    uint64_t l;  /* low-order 64 bits */
    uint64_t h;  /* high-order 64 bits */
} my_uint128;

/* Prototypes for the 128-bit arithmetic, comparison and conversion helpers. */
my_uint128 make_my_uint128 (uint64_t h, uint64_t l);
my_uint128 add128 (my_uint128 a, my_uint128 b);
my_uint128 sub128 (my_uint128 a, my_uint128 b);
my_uint128 lsl128 (my_uint128 a, int sh);
my_uint128 lsr128 (my_uint128 a, int sh);
my_uint128 not128 (my_uint128 a);
my_uint128 umul128lo (my_uint128 a, my_uint128 b);
my_uint128 umul128hi (my_uint128 a, my_uint128 b);
double my_uint128_to_double (my_uint128 a);
int lt128 (my_uint128 a, my_uint128 b);
int eq128 (my_uint128 a, my_uint128 b);
/* bit-pattern reinterpretation between double and uint64_t */
uint64_t double_as_uint64 (double a);
double uint64_as_double (uint64_t a);

/* Constants describing the bit layout of an IEEE-754 binary64 value */
#define FP64_EXPO_BIAS   (1023)  // exponent bias
#define FP64_MANT_BITS   (53)  // mantissa width including the integer bit
#define FP64_MANT_IBIT   (0x0010000000000000ULL)  // integer bit of the mantissa (bit 52)
#define FP64_MANT_MASK   (0x000fffffffffffffULL)  // low 52 bits: stored mantissa fraction
#define FP64_INC_EXP_128 (0x0800000000000000ULL)  // 128 << 52: adding it raises the exponent field by 128
#define FP64_MANT_ADJ    (2)  // adjustment to ensure underestimate

/*
 * Unsigned 128-bit division via a reciprocal: returns dividend / divisor.
 *
 * A double-precision division supplies a starting approximation of
 * 2^128 / divisor, one Halley iteration refines it, and the preliminary
 * quotient obtained by multiplying with the dividend is corrected using the
 * remainder.  Division by zero returns the all-ones value.
 */
my_uint128 udiv128 (my_uint128 dividend, my_uint128 divisor)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one  = make_my_uint128 (0ULL, 1ULL);
    const my_uint128 two  = make_my_uint128 (0ULL, 2ULL);
    my_uint128 recip, temp, quo, rem;
    my_uint128 neg_divisor;
    double r;

    /* handle division by zero first: otherwise 1.0/0 yields infinity and the
       shift count derived from its exponent is far outside 0..127 */
    if (eq128 (divisor, zero)) {
        return not128 (zero);
    }
    neg_divisor = sub128 (zero, divisor);

    /* compute initial approximation for reciprocal; must be underestimate! */
    r = 1.0 / my_uint128_to_double (divisor);
    /* lower the bit pattern slightly and scale by 2^128 via the exponent field */
    uint64_t i = double_as_uint64 (r) - FP64_MANT_ADJ + FP64_INC_EXP_128;
    temp = make_my_uint128 (0ULL, (i & FP64_MANT_MASK) | FP64_MANT_IBIT);
    /* unbiased exponent minus mantissa fraction width, computed in int */
    int sh = (int)(i >> (FP64_MANT_BITS-1)) - FP64_EXPO_BIAS - (FP64_MANT_BITS-1);
    recip = (sh < 0) ? lsr128 (temp, -sh) : lsl128 (temp, sh);

    /* perform Halley iteration with cubic convergence to refine reciprocal */
    temp = umul128lo (neg_divisor, recip);
    temp = add128 (umul128hi (temp, temp), temp);
    recip = add128 (umul128hi (recip, temp), recip);

    /* compute preliminary quotient and remainder */
    quo = umul128hi (dividend, recip);
    rem = sub128 (dividend, umul128lo (divisor, quo));

    /* adjust quotient if too small; quotient off by 2 at most */
    if (! lt128 (rem, divisor)) {
        quo = add128 (quo, lt128 (sub128 (rem, divisor), divisor) ? one : two);
    }

    return quo;
}

/*
 * Add/subtract primitives for multi-word integer arithmetic.  Each macro is a
 * comma expression whose value is the result word.  `cy` is the carry (or
 * borrow) flag; t0/t1/t2 are caller-supplied scratch variables that are
 * overwritten.  The carry detection relies on wrap-around comparisons, so the
 * operands are assumed to be unsigned.
 */

/* a - b - cy (borrow in); leaves the borrow out in cy */
#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

/* a - b; leaves the borrow out in cy (cy is not read) */
#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

/* a - b - cy (borrow in); cy is not modified */
#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

/* a + b + cy (carry in); leaves the carry out in cy */
#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

/* a + b; leaves the carry out in cy (cy is not read) */
#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

/* a + b + cy (carry in); cy is not modified */
#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

/* Return the object representation of a double as a 64-bit unsigned integer. */
uint64_t double_as_uint64 (double a)
{
    uint64_t bits;

    memcpy (&bits, &a, sizeof (bits));
    return bits;
}

/* Return the double whose object representation is the given 64-bit pattern. */
double uint64_as_double (uint64_t a)
{
    double value;

    memcpy (&value, &a, sizeof (value));
    return value;
}

/* 128-bit addition a + b, wrapping modulo 2^128. */
my_uint128 add128 (my_uint128 a, my_uint128 b)
{
    my_uint128 sum;

    sum.l = a.l + b.l;
    /* the low word wrapped around exactly when it ended up below an addend */
    sum.h = a.h + b.h + (sum.l < a.l);
    return sum;
}

/* 128-bit subtraction a - b, wrapping modulo 2^128. */
my_uint128 sub128 (my_uint128 a, my_uint128 b)
{
    my_uint128 diff;

    diff.l = a.l - b.l;
    /* a borrow from the high word is needed when the low minuend is smaller */
    diff.h = a.h - b.h - (a.l < b.l);
    return diff;
}

/*
 * Logical shift left of a 128-bit value by sh bits.
 * A shift count of zero (or a negative count) returns the value unchanged;
 * a count of 128 or more returns zero instead of performing an undefined
 * 64-bit shift.
 */
my_uint128 lsl128 (my_uint128 a, int sh)
{
    if (sh <= 0) {
        return a;
    }
    if (sh >= 128) {
        /* every bit is shifted out */
        a.h = 0ULL;
        a.l = 0ULL;
    } else if (sh >= 64) {
        a.h = a.l << (sh - 64);
        a.l = 0ULL;
    } else {
        /* 1 <= sh <= 63, so both shift counts below are in range */
        a.h = (a.h << sh) + (a.l >> (64 - sh));
        a.l = a.l << sh;
    }
    return a;
}

/*
 * Logical shift right of a 128-bit value by sh bits.
 * A shift count of zero (or a negative count) returns the value unchanged;
 * a count of 128 or more returns zero instead of performing an undefined
 * 64-bit shift.
 */
my_uint128 lsr128 (my_uint128 a, int sh)
{
    if (sh <= 0) {
        return a;
    }
    if (sh >= 128) {
        /* every bit is shifted out */
        a.l = 0ULL;
        a.h = 0ULL;
    } else if (sh >= 64) {
        a.l = a.h >> (sh - 64);
        a.h = 0ULL;
    } else {
        /* 1 <= sh <= 63, so both shift counts below are in range */
        a.l = (a.l >> sh) + (a.h << (64 - sh));
        a.h = a.h >> sh;
    }
    return a;
}

/* Bitwise complement of a 128-bit value. */
my_uint128 not128 (my_uint128 a)
{
    my_uint128 inverted;

    inverted.h = ~a.h;
    inverted.l = ~a.l;
    return inverted;
}

/* Unsigned 128-bit comparison: returns 1 if a < b, otherwise 0. */
int lt128 (my_uint128 a, my_uint128 b)
{
    /* the high words decide unless they are equal */
    if (a.h != b.h) {
        return a.h < b.h;
    }
    return a.l < b.l;
}

/* 128-bit equality: returns 1 if a == b, otherwise 0. */
int eq128 (my_uint128 a, my_uint128 b)
{
    if (a.l != b.l) {
        return 0;
    }
    return a.h == b.h;
}

// derived from Hacker's Delight 2nd ed. figure 8-2
// Full 64x64 -> 128-bit unsigned multiplication built from 32-bit halves.
my_uint128 umul64wide (uint64_t u, uint64_t v)
{
    const uint64_t u_lo = u & 0xffffffffULL;
    const uint64_t u_hi = u >> 32;
    const uint64_t v_lo = v & 0xffffffffULL;
    const uint64_t v_hi = v >> 32;
    const uint64_t lo_lo = u_lo * v_lo;
    /* cross products, each absorbing the carry from the word below */
    const uint64_t hi_lo = u_hi * v_lo + (lo_lo >> 32);
    const uint64_t lo_hi = u_lo * v_hi + (hi_lo & 0xffffffffULL);
    my_uint128 prod;

    prod.h = u_hi * v_hi + (hi_lo >> 32) + (lo_hi >> 32);
    prod.l = (lo_hi << 32) + (lo_lo & 0xffffffffULL);
    return prod;
}

my_uint128 make_my_uint128 (uint64_t h, uint64_t l)
{
    my_uint128 r;
    r.h = h;
    r.l = l;
    return r;
}

/* Low 128 bits of the 256-bit product a * b. */
my_uint128 umul128lo (my_uint128 a, my_uint128 b)
{
    my_uint128 prod = umul64wide (a.l, b.l);

    /* the cross products only contribute to the high word; overflow is discarded */
    prod.h += a.l * b.h;
    prod.h += a.h * b.l;
    return prod;
}

/* High 128 bits of the 256-bit product a * b. */
my_uint128 umul128hi (my_uint128 a, my_uint128 b)
{
    const my_uint128 lo_lo = umul64wide (a.l, b.l);
    const my_uint128 hi_lo = umul64wide (a.h, b.l);
    const my_uint128 lo_hi = umul64wide (a.l, b.h);
    const my_uint128 hi_hi = umul64wide (a.h, b.h);
    /* first cross product plus the carry word from the low product */
    const my_uint128 mid_a = add128 (hi_lo, make_my_uint128 (0ULL, lo_lo.h));
    /* second cross product plus the low word of the first sum */
    const my_uint128 mid_b = add128 (lo_hi, make_my_uint128 (0ULL, mid_a.l));
    const my_uint128 upper = add128 (hi_hi, make_my_uint128 (0ULL, mid_a.h));

    return add128 (upper, make_my_uint128 (0ULL, mid_b.h));
}

/*
 * Convert a 128-bit unsigned integer to double by assembling the IEEE-754
 * 'binary64' bit pattern by hand, rounding to nearest or even.
 * For a zero input no exponent is added, so the all-zero pattern is returned.
 */
double my_uint128_to_double (my_uint128 a)
{
    const int intbits = sizeof (a) * CHAR_BIT;
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    my_uint128 rnd, i = a;
    uint64_t j;
    int sh = 0; // total left shift applied by the normalization below
    double r;

    // normalize integer so MSB is set
    if (lt128 (i, make_my_uint128(0x0000000000000001ULL, 0))) {i = lsl128 (i,64); sh += 64; }
    if (lt128 (i, make_my_uint128(0x0000000100000000ULL, 0))) {i = lsl128 (i,32); sh += 32; }
    if (lt128 (i, make_my_uint128(0x0001000000000000ULL, 0))) {i = lsl128 (i,16); sh += 16; }
    if (lt128 (i, make_my_uint128(0x0100000000000000ULL, 0))) {i = lsl128 (i, 8); sh +=  8; } 
    if (lt128 (i, make_my_uint128(0x1000000000000000ULL, 0))) {i = lsl128 (i, 4); sh +=  4; }
    if (lt128 (i, make_my_uint128(0x4000000000000000ULL, 0))) {i = lsl128 (i, 2); sh +=  2; }
    if (lt128 (i, make_my_uint128(0x8000000000000000ULL, 0))) {i = lsl128 (i, 1); sh +=  1; }
    // form mantissa with explicit integer bit 
    // rnd keeps the bits below the mantissa, moved to the top of the 128-bit value
    rnd = lsl128 (i, FP64_MANT_BITS);
    i = lsr128 (i, intbits - FP64_MANT_BITS);
    j = i.l;
    // add in exponent, taking into account integer bit of mantissa
    if (! eq128 (a, zero)) {
        j += (uint64_t)(FP64_EXPO_BIAS + (intbits-1) - 1 - sh) << (FP64_MANT_BITS-1);
    }
    // round to nearest or even
    // fold any non-zero low word of rnd into a sticky bit in its high word
    rnd.h = rnd.h | (rnd.l != 0);
    // increment when the discarded part exceeds one half, or equals one half with an odd mantissa
    if ((rnd.h > 0x8000000000000000ULL) || 
        ((rnd.h == 0x8000000000000000ULL) && (j & 1))) j++;
    // reinterpret bit pattern as IEEE-754 'binary64'
    r = uint64_as_double (j);
    return r;
}

/*
 * Reference implementation: unsigned 128-bit division by binary long division
 * (shift and subtract).  Returns the quotient dvnd / dvsr; the remainder is
 * tracked internally and discarded.
 */
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 q = dvnd;  /* dividend bits leave at the top, quotient bits enter at the bottom */
    my_uint128 r;         /* running partial remainder */
    my_uint128 diff;
    uint64_t cy, t0, t1, t2;
    int n;

    r.l = 0;
    r.h = 0;
    for (n = CHAR_BIT * sizeof (dvsr); n != 0; n--) {
        /* shift the double-width value r:q left by one bit */
        q.l = ADDcc  (q.l, q.l, cy, t0, t1);
        q.h = ADDCcc (q.h, q.h, cy, t0, t1);
        r.l = ADDCcc (r.l, r.l, cy, t0, t1);
        r.h = ADDC   (r.h, r.h, cy, t0, t1);
        /* trial subtraction; a borrow means the remainder is below the divisor */
        diff.l = SUBcc  (r.l, dvsr.l, cy, t0, t1);
        diff.h = SUBCcc (r.h, dvsr.h, cy, t0, t1, t2);
        if (cy == 0) {
            r = diff;
            q.l |= 1;
        }
    }
    return q;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
/* generator state with fixed seeds; kiss64_t is scratch storage for MWC64 */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
/* each component macro advances its own state and evaluates to the new value */
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* combined generator: sum of the three components; every use advances all of them */
#define KISS64 (MWC64 + XSH64 + CNG64)

/* Table of test operands filled in by main().
   NOTE(review): with 128-bit entries the seven pattern classes appear to
   produce 3 * (3*128 + 2*128*128) = 99456 values, which fits; confirm the
   capacity whenever the pattern classes change. */
my_uint128 v[100000]; /* FIXME: size appropriately */

/*
 * Test driver for udiv128(): builds a table of bit patterns, tests every
 * (dividend, divisor) pair from that table, then continues with random
 * operands.  bitwise_division_128() serves as the reference; the first
 * mismatch is printed and ends the run with EXIT_FAILURE.
 */
int main (void)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
    my_uint128 dividend, divisor, quot, ref;
    int i, j, patterns, idx = 0, nbrBits = sizeof (v[0]) * CHAR_BIT;
    int patterns_done = 0;
    long long int pattern_tests;

    /* pattern class 1: 2**i */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = lsl128 (one, i);
        idx++;
    }
    /* pattern class 2: 2**i-1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = sub128 (lsl128 (one, i), one);
        idx++;
    }
    /* pattern class 3: 2**i+1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = add128 (lsl128 (one, i), one);
        idx++;
    }
    /* pattern class 4: 2**i + 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = add128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    /* pattern class 5: 2**i - 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = sub128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    patterns = idx;
    /* pattern class 6: one's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = not128 (v [i]);
        idx++;
    }
    /* pattern class 7: two's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = sub128 (zero, v[i]);
        idx++;
    }
    patterns = idx;
    /* number of pattern pairs, computed in long long: the square of the
       pattern count does not fit in a 32-bit int */
    pattern_tests = (long long int)patterns * patterns;
    printf ("Starting pattern-based tests. Number of patterns: %d\n", patterns);

    for (long long int k = 0; k < 100000000000LL; k++) {
        if (k < pattern_tests) {
            dividend = v [k / patterns];
            divisor  = v [k % patterns];
        } else {
            if (!patterns_done) {
                printf ("Starting random tests\n");
                patterns_done = 1;
            }
            dividend.l = KISS64;
            dividend.h = KISS64;
            divisor.h  = KISS64;
            divisor.l  = KISS64;
        }
        /* exclude cases with undefined results: division by zero */
        if (! eq128 (divisor, zero)) {
            quot = udiv128 (dividend, divisor);
            ref = bitwise_division_128 (dividend, divisor);
            if (! eq128 (quot, ref)) {
                /* %llx requires unsigned long long; uint64_t may be a different type */
                printf ("@ (%016llx_%016llx, %016llx_%016llx): quot = %016llx_%016llx  ref=%016llx_%016llx\n", 
                        (unsigned long long)dividend.h, (unsigned long long)dividend.l,
                        (unsigned long long)divisor.h, (unsigned long long)divisor.l,
                        (unsigned long long)quot.h, (unsigned long long)quot.l,
                        (unsigned long long)ref.h, (unsigned long long)ref.l);
                return EXIT_FAILURE;
            }
        }
    }
    printf ("unsigned 128-bit division: tests passed\n");
    return EXIT_SUCCESS;
}

Question 3

นี่คือสิ่งที่ฉันลงเอยด้วยการเขียนขึ้นมา ฉันมั่นใจว่ายังมีวิธีอื่นที่เร็วกว่านี้มาก แต่อย่างน้อยวิธีนี้ก็ใช้งานได้

อ้างอิงจาก: https://en.wikipedia.org/wiki/Division_algorithm#Integer_division_(unsigned)_with_remainder โดยดัดแปลงให้เหมาะกับกรณีการใช้งานนี้โดยเฉพาะ

// q = (2^128 - 1) / d, where q is the 64 LSBs of the quotient
//
// Binary long division specialised for an all-ones 128-bit dividend: every
// dividend bit shifted into the remainder is 1, so the dividend itself is
// never stored.  For d == 0 no division is performed and all ones is returned.
uint64_t two_pow_128_minus_1_div_d(uint64_t d) {
    uint64_t q = 0, r_hi = 0, r_lo = 0;

    for (int i = 127; i >= 0; --i) {
        // shift the 128-bit remainder left and bring in the next dividend bit (always 1)
        r_hi = (r_hi << 1) | (r_lo >> 63);
        r_lo = (r_lo << 1) | 1u;

        if (r_hi || r_lo >= d) {
            // remainder >= d: subtract d, borrowing from the high word if needed
            const uint64_t borrow = d > r_lo;
            r_lo -= d;
            r_hi -= borrow;

            // only the low 64 quotient bits are kept; the constant must be
            // 64 bits wide because unsigned long may have only 32 bits
            if (i < 64)
                q |= (uint64_t)1 << i;
        }
    }
    return q;
}

njuffa · Answer 1 · 2021-11-22T01:55:43

ฉันไม่ทราบว่ามีการปรับแต่งประสิทธิภาพ (optimization) ใดที่ใช้ได้กับการหารจำนวนเต็มที่ตัวตั้งเป็นค่าคงที่ เพื่อตรวจสอบให้แน่ใจ ฉันลองกรณีทดสอบที่ตัวตั้งมีบิตเป็น 1 ทั้งหมดบน Compiler Explorer เมื่อใช้ gcc, icc และ clang โดยกำหนดระดับ optimization สูงสุด โค้ดที่สร้างขึ้นแสดงให้เห็นว่าไม่มีการปรับแต่งใด ๆ กับการหาร

แน่นอนว่าสามารถสร้างรูทีนการหาร 128 บิตที่มีประสิทธิภาพสูงได้ แต่จากประสบการณ์ส่วนตัว ฉันทราบว่างานนี้ผิดพลาดได้ง่ายและทดสอบได้ค่อนข้างยาก เพราะต้องการความครอบคลุมของการทดสอบ (test coverage) ที่ดี รวมถึงกรณีขอบ (corner case) ในขณะที่การทดสอบแบบครบทุกกรณี (exhaustive test) เป็นไปไม่ได้สำหรับตัวถูกดำเนินการขนาดนี้ ความพยายามในการออกแบบและทดสอบจึงเกินกว่าที่ดูสมเหตุสมผลสำหรับคำตอบบน Stackoverflow ไปได้ง่าย ๆ ถึงสองอันดับขนาด (order of magnitude) ในฐานสิบ

วิธีง่าย ๆ ในการหารจำนวนเต็มคือใช้อัลกอริทึมหารยาวที่เราทุกคนเรียนมาตั้งแต่ชั้นประถม เพียงแต่ทำในฐานสอง ซึ่งทำให้การตัดสินค่าบิตถัดไปของผลหารง่ายเป็นพิเศษ: บิตนั้นเป็น 1 เมื่อเศษบางส่วน (partial remainder) ปัจจุบันมากกว่าหรือเท่ากับตัวหาร และเป็น 0 ในกรณีอื่น เมื่อใช้การหารยาวในฐานสอง การดำเนินการกับจำนวนเต็มที่ต้องใช้มีเพียงการบวกและการลบเท่านั้น

เราสามารถสร้าง primitive ที่พกพาได้สำหรับดำเนินการเหล่านี้กับตัวถูกดำเนินการที่มีความยาวบิตเท่าใดก็ได้ โดยเลียนแบบวิธีที่คำสั่งเครื่องของหน่วยประมวลผลใช้ดำเนินการกับจำนวนเต็มหลายคำ (multi-word integer): บวกพร้อมตัวทดออก (carry-out), บวกพร้อมตัวทดเข้า (carry-in), และบวกพร้อมทั้งตัวทดเข้าและตัวทดออก ส่วนการลบก็ทำนองเดียวกัน ในโค้ดด้านล่างฉันใช้มาโคร C อย่างง่ายสำหรับงานนี้ แน่นอนว่ายังมีวิธีที่ซับซ้อนกว่านี้ได้อีก

เนื่องจากระบบที่ฉันใช้งานอยู่ในขณะนี้ไม่รองรับจำนวนเต็ม 128 บิต ฉันจึงสร้างต้นแบบและทดสอบแนวทางนี้กับจำนวนเต็ม 64 บิต จากนั้นรุ่น 128 บิตก็เป็นเพียงการเปลี่ยนชื่อแบบกลไกอย่างง่าย บนหน่วยประมวลผล 64 บิตสมัยใหม่ ฉันคาดว่าฟังก์ชันหาร 128 บิตนี้จะใช้เวลาทำงานราว 3000 รอบสัญญาณนาฬิกา

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>

/*
 * Add/subtract primitives for multi-word integer arithmetic.  Each macro is a
 * comma expression whose value is the result word.  `cy` is the carry (or
 * borrow) flag; t0/t1/t2 are caller-supplied scratch variables that are
 * overwritten.  The carry detection relies on wrap-around comparisons, so the
 * operands are assumed to be unsigned.
 */

/* a - b - cy (borrow in); leaves the borrow out in cy */
#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

/* a - b; leaves the borrow out in cy (cy is not read) */
#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

/* a - b - cy (borrow in); cy is not modified */
#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

/* a + b + cy (carry in); leaves the carry out in cy */
#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

/* a + b; leaves the carry out in cy (cy is not read) */
#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

/* a + b + cy (carry in); cy is not modified */
#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

/* 128-bit unsigned integer represented as two 64-bit words. */
typedef struct {
    uint64_t l;  /* low-order 64 bits */
    uint64_t h;  /* high-order 64 bits */
} my_uint128;

/*
 * Unsigned 128-bit division by binary long division (shift and subtract).
 * Returns the quotient dvnd / dvsr; the remainder is tracked internally and
 * discarded.
 */
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 q = dvnd;  /* dividend bits leave at the top, quotient bits enter at the bottom */
    my_uint128 r;         /* running partial remainder */
    my_uint128 diff;
    uint64_t cy, t0, t1, t2;
    int n;

    r.l = 0;
    r.h = 0;
    for (n = CHAR_BIT * sizeof (my_uint128); n != 0; n--) {
        /* shift the double-width value r:q left by one bit */
        q.l = ADDcc  (q.l, q.l, cy, t0, t1);
        q.h = ADDCcc (q.h, q.h, cy, t0, t1);
        r.l = ADDCcc (r.l, r.l, cy, t0, t1);
        r.h = ADDC   (r.h, r.h, cy, t0, t1);
        /* trial subtraction; a borrow means the remainder is below the divisor */
        diff.l = SUBcc  (r.l, dvsr.l, cy, t0, t1);
        diff.h = SUBCcc (r.h, dvsr.h, cy, t0, t1, t2);
        if (cy == 0) {
            r = diff;
            q.l |= 1;
        }
    }
    return q;
}

/* 64-bit unsigned integer represented as two 32-bit words. */
typedef struct {
    uint32_t l;  /* low-order 32 bits */
    uint32_t h;  /* high-order 32 bits */
} my_uint64;

/*
 * Unsigned 64-bit division by binary long division (shift and subtract),
 * operating on two 32-bit halves.  Returns the quotient dvnd / dvsr; the
 * remainder is tracked internally and discarded.
 */
my_uint64 bitwise_division_64 (my_uint64 dvnd, my_uint64 dvsr)
{
    my_uint64 q = dvnd;  /* dividend bits leave at the top, quotient bits enter at the bottom */
    my_uint64 r;         /* running partial remainder */
    my_uint64 diff;
    uint32_t cy, t0, t1, t2;
    int n;

    r.l = 0;
    r.h = 0;
    for (n = CHAR_BIT * sizeof (my_uint64); n != 0; n--) {
        /* shift the double-width value r:q left by one bit */
        q.l = ADDcc  (q.l, q.l, cy, t0, t1);
        q.h = ADDCcc (q.h, q.h, cy, t0, t1);
        r.l = ADDCcc (r.l, r.l, cy, t0, t1);
        r.h = ADDC   (r.h, r.h, cy, t0, t1);
        /* trial subtraction; a borrow means the remainder is below the divisor */
        diff.l = SUBcc  (r.l, dvsr.l, cy, t0, t1);
        diff.h = SUBCcc (r.h, dvsr.h, cy, t0, t1, t2);
        if (cy == 0) {
            r = diff;
            q.l |= 1;
        }
    }
    return q;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
/* generator state with fixed seeds; kiss64_t is scratch storage for MWC64 */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
/* each component macro advances its own state and evaluates to the new value */
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* combined generator: sum of the three components; every use advances all of them */
#define KISS64 (MWC64 + XSH64 + CNG64)

/*
 * Random test driver: compares bitwise_division_64() against the native
 * 64-bit division for pseudo-random operands.  Returns EXIT_FAILURE on the
 * first mismatch; the loop ends only when the generator produces a == 0.
 */
int main (void)
{
    uint64_t a, b, res, ref;
    my_uint64 aa, bb, rr;
    do {
        a = KISS64;
        b = KISS64;
        if (b == 0) {
            continue; /* a / b is undefined for a zero divisor; draw new operands */
        }
        ref = a / b;

        /* split the operands into 32-bit halves for the emulated division */
        aa.l = (uint32_t)a;
        aa.h = (uint32_t)(a >> 32);
        bb.l = (uint32_t)b;
        bb.h = (uint32_t)(b >> 32);
        rr = bitwise_division_64 (aa, bb);
        res = (((uint64_t)rr.h) << 32) + rr.l;

        if (ref != res) {
            /* %llx requires unsigned long long; uint64_t may be a different type */
            printf ("a=%016llx b=%016llx res=%016llx ref=%016llx\n",
                    (unsigned long long)a, (unsigned long long)b,
                    (unsigned long long)res, (unsigned long long)ref);
            return EXIT_FAILURE;
        }
    } while (a);
    return EXIT_SUCCESS;
}

แนวทางที่เร็วกว่าการคำนวณทีละบิตคือ คำนวณส่วนกลับ (reciprocal) ของตัวหาร แล้วคูณด้วยตัวตั้งเพื่อให้ได้ผลหารเบื้องต้น จากนั้นคำนวณเศษเพื่อปรับผลหารให้ถูกต้อง การคำนวณทั้งหมดทำได้ด้วยเลขคณิตจุดตรึง (fixed-point) อย่างไรก็ตาม บนหน่วยประมวลผลสมัยใหม่ที่มีหน่วยคำนวณจุดลอยตัว (floating-point) ที่รวดเร็ว จะสะดวกกว่าหากสร้างค่าประมาณเริ่มต้นของส่วนกลับด้วยการหารแบบ double-precision จากนั้นการวนซ้ำแบบ Halley เพียงครั้งเดียวซึ่งลู่เข้าแบบกำลังสาม (cubic convergence) ก็จะให้ส่วนกลับที่มีความแม่นยำเต็มที่

การวนซ้ำแบบ Halley สำหรับส่วนกลับนั้นใช้การคูณจำนวนเต็มอย่างหนัก โดยการคูณ 64x64 บิตที่ให้ผลลัพธ์ 128 บิต (umul64wide() ในโค้ดด้านล่าง) เป็นส่วนประกอบพื้นฐานที่สำคัญต่อประสิทธิภาพ บนสถาปัตยกรรม 64 บิตสมัยใหม่ การคูณนี้มักเป็นคำสั่งเครื่องเพียงคำสั่งเดียวที่ทำงานเสร็จในไม่กี่รอบ แต่โค้ดที่พกพาได้ไม่สามารถเข้าถึงคำสั่งนี้ได้ โค้ดที่พกพาได้ซึ่งจำลองคำสั่งดังกล่าวต้องใช้ประมาณ 15 ถึง 20 คำสั่ง ขึ้นอยู่กับสถาปัตยกรรมและคอมไพเลอร์

การหาร 128 บิตทั้งหมดน่าจะใช้เวลาประมาณ 300 รอบ หรือเร็วกว่าการคำนวณทีละบิตอย่างง่ายราวสิบเท่า เนื่องจากโค้ดค่อนข้างซับซ้อน จึงต้องทดสอบเป็นจำนวนมากเพื่อให้แน่ใจว่าทำงานถูกต้อง ในเฟรมเวิร์กด้านล่าง ฉันใช้การทดสอบตามรูปแบบ (pattern-based) และการทดสอบแบบสุ่มเพื่อทดสอบอย่างเข้มข้นปานกลาง โดยใช้ implementation แบบทีละบิตที่ตรงไปตรงมาเป็นตัวอ้างอิง

implementation ของ udiv128() ด้านล่างตั้งสมมติฐานว่าสภาพแวดล้อมการเขียนโปรแกรม (environment) ใช้เลขคณิตจุดลอยตัวที่สอดคล้องกับ IEEE-754, ชนิด double ถูกแมปเป็นชนิด binary64 ของ IEEE-754 และการหารตัวถูกดำเนินการชนิด double ได้ผลที่ปัดเศษอย่างถูกต้อง (correctly rounded)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>

/* 128-bit unsigned integer represented as two 64-bit words. */
typedef struct {
    uint64_t l;  /* low-order 64 bits */
    uint64_t h;  /* high-order 64 bits */
} my_uint128;

/* Prototypes for the 128-bit arithmetic, comparison and conversion helpers. */
my_uint128 make_my_uint128 (uint64_t h, uint64_t l);
my_uint128 add128 (my_uint128 a, my_uint128 b);
my_uint128 sub128 (my_uint128 a, my_uint128 b);
my_uint128 lsl128 (my_uint128 a, int sh);
my_uint128 lsr128 (my_uint128 a, int sh);
my_uint128 not128 (my_uint128 a);
my_uint128 umul128lo (my_uint128 a, my_uint128 b);
my_uint128 umul128hi (my_uint128 a, my_uint128 b);
double my_uint128_to_double (my_uint128 a);
int lt128 (my_uint128 a, my_uint128 b);
int eq128 (my_uint128 a, my_uint128 b);
/* bit-pattern reinterpretation between double and uint64_t */
uint64_t double_as_uint64 (double a);
double uint64_as_double (uint64_t a);

/* Constants describing the bit layout of an IEEE-754 binary64 value */
#define FP64_EXPO_BIAS   (1023)  // exponent bias
#define FP64_MANT_BITS   (53)  // mantissa width including the integer bit
#define FP64_MANT_IBIT   (0x0010000000000000ULL)  // integer bit of the mantissa (bit 52)
#define FP64_MANT_MASK   (0x000fffffffffffffULL)  // low 52 bits: stored mantissa fraction
#define FP64_INC_EXP_128 (0x0800000000000000ULL)  // 128 << 52: adding it raises the exponent field by 128
#define FP64_MANT_ADJ    (2)  // adjustment to ensure underestimate

/*
 * Unsigned 128-bit division via a reciprocal: returns dividend / divisor.
 *
 * A double-precision division supplies a starting approximation of
 * 2^128 / divisor, one Halley iteration refines it, and the preliminary
 * quotient obtained by multiplying with the dividend is corrected using the
 * remainder.  Division by zero returns the all-ones value.
 */
my_uint128 udiv128 (my_uint128 dividend, my_uint128 divisor)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one  = make_my_uint128 (0ULL, 1ULL);
    const my_uint128 two  = make_my_uint128 (0ULL, 2ULL);
    my_uint128 recip, temp, quo, rem;
    my_uint128 neg_divisor;
    double r;

    /* handle division by zero first: otherwise 1.0/0 yields infinity and the
       shift count derived from its exponent is far outside 0..127 */
    if (eq128 (divisor, zero)) {
        return not128 (zero);
    }
    neg_divisor = sub128 (zero, divisor);

    /* compute initial approximation for reciprocal; must be underestimate! */
    r = 1.0 / my_uint128_to_double (divisor);
    /* lower the bit pattern slightly and scale by 2^128 via the exponent field */
    uint64_t i = double_as_uint64 (r) - FP64_MANT_ADJ + FP64_INC_EXP_128;
    temp = make_my_uint128 (0ULL, (i & FP64_MANT_MASK) | FP64_MANT_IBIT);
    /* unbiased exponent minus mantissa fraction width, computed in int */
    int sh = (int)(i >> (FP64_MANT_BITS-1)) - FP64_EXPO_BIAS - (FP64_MANT_BITS-1);
    recip = (sh < 0) ? lsr128 (temp, -sh) : lsl128 (temp, sh);

    /* perform Halley iteration with cubic convergence to refine reciprocal */
    temp = umul128lo (neg_divisor, recip);
    temp = add128 (umul128hi (temp, temp), temp);
    recip = add128 (umul128hi (recip, temp), recip);

    /* compute preliminary quotient and remainder */
    quo = umul128hi (dividend, recip);
    rem = sub128 (dividend, umul128lo (divisor, quo));

    /* adjust quotient if too small; quotient off by 2 at most */
    if (! lt128 (rem, divisor)) {
        quo = add128 (quo, lt128 (sub128 (rem, divisor), divisor) ? one : two);
    }

    return quo;
}

/*
 * Add/subtract primitives for multi-word integer arithmetic.  Each macro is a
 * comma expression whose value is the result word.  `cy` is the carry (or
 * borrow) flag; t0/t1/t2 are caller-supplied scratch variables that are
 * overwritten.  The carry detection relies on wrap-around comparisons, so the
 * operands are assumed to be unsigned.
 */

/* a - b - cy (borrow in); leaves the borrow out in cy */
#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

/* a - b; leaves the borrow out in cy (cy is not read) */
#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

/* a - b - cy (borrow in); cy is not modified */
#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

/* a + b + cy (carry in); leaves the carry out in cy */
#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

/* a + b; leaves the carry out in cy (cy is not read) */
#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

/* a + b + cy (carry in); cy is not modified */
#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

/* Return the object representation of a double as a 64-bit unsigned integer. */
uint64_t double_as_uint64 (double a)
{
    uint64_t bits;

    memcpy (&bits, &a, sizeof (bits));
    return bits;
}

/* Return the double whose object representation is the given 64-bit pattern. */
double uint64_as_double (uint64_t a)
{
    double value;

    memcpy (&value, &a, sizeof (value));
    return value;
}

/* 128-bit addition a + b, wrapping modulo 2^128. */
my_uint128 add128 (my_uint128 a, my_uint128 b)
{
    my_uint128 sum;

    sum.l = a.l + b.l;
    /* the low word wrapped around exactly when it ended up below an addend */
    sum.h = a.h + b.h + (sum.l < a.l);
    return sum;
}

/* 128-bit subtraction a - b, wrapping modulo 2^128. */
my_uint128 sub128 (my_uint128 a, my_uint128 b)
{
    my_uint128 diff;

    diff.l = a.l - b.l;
    /* a borrow from the high word is needed when the low minuend is smaller */
    diff.h = a.h - b.h - (a.l < b.l);
    return diff;
}

/*
 * Logical shift left of a 128-bit value by sh bits.
 * A shift count of zero (or a negative count) returns the value unchanged;
 * a count of 128 or more returns zero instead of performing an undefined
 * 64-bit shift.
 */
my_uint128 lsl128 (my_uint128 a, int sh)
{
    if (sh <= 0) {
        return a;
    }
    if (sh >= 128) {
        /* every bit is shifted out */
        a.h = 0ULL;
        a.l = 0ULL;
    } else if (sh >= 64) {
        a.h = a.l << (sh - 64);
        a.l = 0ULL;
    } else {
        /* 1 <= sh <= 63, so both shift counts below are in range */
        a.h = (a.h << sh) + (a.l >> (64 - sh));
        a.l = a.l << sh;
    }
    return a;
}

/*
 * Logical shift right of a 128-bit value by sh bits.
 * A shift count of zero (or a negative count) returns the value unchanged;
 * a count of 128 or more returns zero instead of performing an undefined
 * 64-bit shift.
 */
my_uint128 lsr128 (my_uint128 a, int sh)
{
    if (sh <= 0) {
        return a;
    }
    if (sh >= 128) {
        /* every bit is shifted out */
        a.l = 0ULL;
        a.h = 0ULL;
    } else if (sh >= 64) {
        a.l = a.h >> (sh - 64);
        a.h = 0ULL;
    } else {
        /* 1 <= sh <= 63, so both shift counts below are in range */
        a.l = (a.l >> sh) + (a.h << (64 - sh));
        a.h = a.h >> sh;
    }
    return a;
}

/* Bitwise complement of a 128-bit value. */
my_uint128 not128 (my_uint128 a)
{
    my_uint128 inverted;

    inverted.h = ~a.h;
    inverted.l = ~a.l;
    return inverted;
}

/* Unsigned 128-bit comparison: returns 1 if a < b, otherwise 0. */
int lt128 (my_uint128 a, my_uint128 b)
{
    /* the high words decide unless they are equal */
    if (a.h != b.h) {
        return a.h < b.h;
    }
    return a.l < b.l;
}

/* Returns 1 if both words of a and b are equal, otherwise 0. */
int eq128 (my_uint128 a, my_uint128 b)
{
    if (a.l != b.l) {
        return 0;
    }
    return a.h == b.h;
}

// derived from Hacker's Delight 2nd ed. figure 8-2
// Full 64x64 -> 128-bit unsigned product, built from four 32x32 -> 64-bit
// partial products so that no intermediate sum can overflow 64 bits.
my_uint128 umul64wide (uint64_t u, uint64_t v)
{
    const uint64_t u_lo = (uint32_t)u, u_hi = u >> 32;
    const uint64_t v_lo = (uint32_t)v, v_hi = v >> 32;

    const uint64_t lo_lo = u_lo * v_lo;
    /* first cross product plus the upper half of the low product */
    const uint64_t cross1 = u_hi * v_lo + (lo_lo >> 32);
    /* second cross product plus the lower half of the first */
    const uint64_t cross2 = u_lo * v_hi + (uint32_t)cross1;

    my_uint128 product;
    product.l = (cross2 << 32) + (uint32_t)lo_lo;
    product.h = u_hi * v_hi + (cross1 >> 32) + (cross2 >> 32);
    return product;
}

/* Build a 128-bit value from its high word h and low word l. */
my_uint128 make_my_uint128 (uint64_t h, uint64_t l)
{
    my_uint128 value;

    value.l = l;
    value.h = h;
    return value;
}

/* Low 128 bits of the 128x128-bit product a * b. */
my_uint128 umul128lo (my_uint128 a, my_uint128 b)
{
    /* full product of the low words supplies bits 0..127 */
    my_uint128 product = umul64wide (a.l, b.l);

    /* the cross products only affect the high word; their overflow is dropped */
    product.h += a.l * b.h + a.h * b.l;
    return product;
}

/* High 128 bits of the 256-bit product a * b, accumulated from the four
   64x64 -> 128-bit partial products with carries propagated word by word. */
my_uint128 umul128hi (my_uint128 a, my_uint128 b)
{
    const my_uint128 lo_lo = umul64wide (a.l, b.l);
    /* a.h * b.l plus the upper word of a.l * b.l */
    const my_uint128 hi_lo = add128 (umul64wide (a.h, b.l),
                                     make_my_uint128 (0ULL, lo_lo.h));
    /* a.l * b.h plus the lower word of the previous sum */
    const my_uint128 lo_hi = add128 (umul64wide (a.l, b.h),
                                     make_my_uint128 (0ULL, hi_lo.l));
    /* a.h * b.h plus the upper word of the first cross sum */
    const my_uint128 hi_hi = add128 (umul64wide (a.h, b.h),
                                     make_my_uint128 (0ULL, hi_lo.h));

    return add128 (hi_hi, make_my_uint128 (0ULL, lo_hi.h));
}

/*
 * Convert an unsigned 128-bit integer to double by assembling the result's
 * bit pattern with integer operations: normalize, split into kept mantissa
 * bits and discarded bits, add the exponent, then round to nearest-or-even.
 * A zero input yields the all-zero bit pattern.
 * NOTE(review): FP64_MANT_BITS and FP64_EXPO_BIAS are defined outside this
 * block; presumably 53 (including the integer bit) and 1023 - confirm.
 */
double my_uint128_to_double (my_uint128 a)
{
    const int intbits = sizeof (a) * CHAR_BIT; // width of my_uint128 in bits
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    my_uint128 rnd, i = a;
    uint64_t j;
    int sh = 0; // total left shift applied during normalization
    double r;

    // normalize integer so MSB is set
    // (binary search: shift by 64, 32, ..., 1 whenever the top bits are clear)
    if (lt128 (i, make_my_uint128(0x0000000000000001ULL, 0))) {i = lsl128 (i,64); sh += 64; }
    if (lt128 (i, make_my_uint128(0x0000000100000000ULL, 0))) {i = lsl128 (i,32); sh += 32; }
    if (lt128 (i, make_my_uint128(0x0001000000000000ULL, 0))) {i = lsl128 (i,16); sh += 16; }
    if (lt128 (i, make_my_uint128(0x0100000000000000ULL, 0))) {i = lsl128 (i, 8); sh +=  8; } 
    if (lt128 (i, make_my_uint128(0x1000000000000000ULL, 0))) {i = lsl128 (i, 4); sh +=  4; }
    if (lt128 (i, make_my_uint128(0x4000000000000000ULL, 0))) {i = lsl128 (i, 2); sh +=  2; }
    if (lt128 (i, make_my_uint128(0x8000000000000000ULL, 0))) {i = lsl128 (i, 1); sh +=  1; }
    // form mantissa with explicit integer bit 
    // rnd: the bits below the kept mantissa, left-aligned for the rounding test
    rnd = lsl128 (i, FP64_MANT_BITS);
    // i: the top FP64_MANT_BITS bits of the normalized value
    i = lsr128 (i, intbits - FP64_MANT_BITS);
    j = i.l;
    // add in exponent, taking into account integer bit of mantissa
    // (the extra -1 compensates for the integer bit, which sits at bit
    // position FP64_MANT_BITS-1 and is therefore added into the exponent field)
    if (! eq128 (a, zero)) {
        j += (uint64_t)(FP64_EXPO_BIAS + (intbits-1) - 1 - sh) << (FP64_MANT_BITS-1);
    }
    // round to nearest or even
    // fold the low word into a sticky bit so only rnd.h needs to be examined
    rnd.h = rnd.h | (rnd.l != 0);
    // increment if the discarded part exceeds one half, or equals one half
    // and the kept mantissa is odd; a carry propagates upward within j
    if ((rnd.h > 0x8000000000000000ULL) || 
        ((rnd.h == 0x8000000000000000ULL) && (j & 1))) j++;
    // reinterpret bit pattern as IEEE-754 'binary64'
    r = uint64_as_double (j);
    return r;
}

/*
 * Reference 128-bit unsigned division using binary long division.
 * The register pair (rem:quot) is shifted left one bit per step; whenever the
 * partial remainder is at least the divisor, the divisor is subtracted and a
 * 1 is recorded in the vacated low bit of quot. Returns the quotient.
 * With a zero divisor every comparison succeeds, so all quotient bits are 1.
 */
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 quot, rem;
    int step;

    quot.l = dvnd.l;
    quot.h = dvnd.h;
    rem.l = 0;
    rem.h = 0;
    for (step = CHAR_BIT * sizeof (dvsr); step != 0; step--) {
        /* shift rem:quot left by one; quot's top bit enters rem's low bit */
        const uint64_t quot_msb = quot.h >> 63;
        quot.h = (quot.h << 1) | (quot.l >> 63);
        quot.l = quot.l << 1;
        rem.h = (rem.h << 1) | (rem.l >> 63);
        rem.l = (rem.l << 1) | quot_msb;

        /* remainder >= divisor: subtract and set the new quotient bit */
        if ((rem.h > dvsr.h) || ((rem.h == dvsr.h) && (rem.l >= dvsr.l))) {
            const uint64_t borrow = rem.l < dvsr.l;
            rem.l = rem.l - dvsr.l;
            rem.h = rem.h - (dvsr.h + borrow);
            quot.l = quot.l | 1;
        }
    }
    return quot;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
/* Generator state: kiss64_x and kiss64_c belong to MWC64, kiss64_y to XSH64,
   kiss64_z to CNG64; kiss64_t is scratch storage used inside MWC64. */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
/* Multiply-with-carry step: updates kiss64_x and kiss64_c (the comparison
   kiss64_x < kiss64_t detects wrap-around of the addition); yields kiss64_x. */
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
/* Xorshift step with shifts 13, 17, 43: updates and yields kiss64_y. */
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
/* Linear congruential step (arithmetic wraps modulo 2^64): updates and yields kiss64_z. */
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* Combined generator: sum of the three components. Every expansion advances
   all three states, so each use produces a new value. */
#define KISS64 (MWC64 + XSH64 + CNG64)

/* Test-vector storage. main() generates 3 * (3*N + 2*N*N) patterns, where N is
   the bit width of my_uint128; for N = 128 that is 99456 entries, which fits. */
my_uint128 v[100000];

/*
 * Test driver for udiv128(): checks it against bitwise_division_128().
 *
 * First every ordered pair of special bit patterns (powers of two, their
 * neighbours, sums and differences, and the one's and two's complements of
 * those) is used as (dividend, divisor); afterwards operands are drawn from
 * the KISS64 generator. Cases with a zero divisor are skipped.
 *
 * Returns EXIT_FAILURE after printing the operands and both quotients of the
 * first mismatch, or EXIT_SUCCESS when all iterations agree.
 */
int main (void)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
    my_uint128 dividend, divisor, quot, ref;
    int i, j, patterns, idx = 0, nbrBits = sizeof (v[0]) * CHAR_BIT;
    int patterns_done = 0;
    long long int pattern_pairs;

    /* pattern class 1: 2**i */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = lsl128 (one, i);
        idx++;
    }
    /* pattern class 2: 2**i-1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = sub128 (lsl128 (one, i), one);
        idx++;
    }
    /* pattern class 3: 2**i+1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = add128 (lsl128 (one, i), one); 
        idx++;
    }
    /* pattern class 4: 2**i + 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = add128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    /* pattern class 5: 2**i - 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = sub128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    patterns = idx;
    /* pattern class 6: one's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = not128 (v [i]);
        idx++;
    }
    /* pattern class 7: two's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = sub128 (zero, v[i]);
        idx++;
    }
    patterns = idx;
    printf ("Starting pattern-based tests. Number of patterns: %d\n", patterns);

    /* The number of ordered pairs exceeds INT_MAX (99456 * 99456 is about
       9.9e9), so the product must be formed in long long; multiplying two
       ints here would be signed overflow, i.e. undefined behavior. */
    pattern_pairs = (long long int)patterns * patterns;

    for (long long int k = 0; k < 100000000000LL; k++) {
        if (k < pattern_pairs) {
            dividend = v [k / patterns];
            divisor  = v [k % patterns];
        } else {
            if (!patterns_done) {
                printf ("Starting random tests\n");
                patterns_done = 1;
            }
            dividend.l = KISS64;
            dividend.h = KISS64;
            divisor.h  = KISS64;
            divisor.l  = KISS64;
        }
        /* exclude cases with undefined results: division by zero */
        if (! eq128 (divisor, zero)) {
            quot = udiv128 (dividend, divisor);
            ref = bitwise_division_128 (dividend, divisor);
            if (! eq128 (quot, ref)) {
                /* %llx requires unsigned long long; uint64_t may be a
                   different type (e.g. unsigned long), hence the casts */
                printf ("@ (%016llx_%016llx, %016llx_%016llx): quot = %016llx_%016llx  ref=%016llx_%016llx\n", 
                        (unsigned long long)dividend.h, (unsigned long long)dividend.l,
                        (unsigned long long)divisor.h, (unsigned long long)divisor.l,
                        (unsigned long long)quot.h, (unsigned long long)quot.l,
                        (unsigned long long)ref.h, (unsigned long long)ref.l);
                return EXIT_FAILURE;
            }
        }
    }
    printf ("unsigned 128-bit division: tests passed\n");
    return EXIT_SUCCESS;
}

คำตอบสุดยอดมาก ขอบคุณ! ถ้าทำได้ฉันจะโหวตให้คุณมากกว่านี้ :)
@tgonzalez89 เพื่อให้คุณพอเห็นภาพ: คำตอบในรูปแบบปัจจุบันนี้ใช้เวลาทำงานราว 10 ชั่วโมง ขอบเขตของการออกแบบ (design space) สำหรับการหารจำนวนเต็มนั้นค่อนข้างกว้าง ใครสักคนสามารถเขียนวิทยานิพนธ์ระดับปริญญาโททั้งเล่มเกี่ยวกับเรื่องนี้ได้อย่างง่ายดาย แนวทางหลักมีสองแบบ คือการคำนวณทีละหลัก (ตั้งแต่ฐานสองไปจนถึงฐาน (radix) ที่สูงขึ้นตามที่แนะนำไว้ในความเห็น เช่น อัลกอริทึม D ของ Knuth ใน TAOCP Vol. 2 หรือ Pope-Stein) และวิธีที่อาศัยการทำซ้ำเชิงฟังก์ชัน (functional iteration) ซึ่งยังมีปัญหาเพิ่มเติมเรื่องการสร้างค่าประมาณเริ่มต้น ในที่นี้ฉันเลี่ยงปัญหายุ่งยากข้อหลังไปเป็นส่วนใหญ่โดยใช้การหารแบบ double เป็นจุดเริ่มต้น

tgonzalez89 · Answer 2 · 2021-11-21T07:08:09

นี่คือโค้ดที่ฉันลงเอยด้วยการเขียนขึ้นมา ฉันมั่นใจว่ายังมีทางเลือกอื่นที่เร็วกว่านี้มาก แต่อย่างน้อยโค้ดนี้ก็ใช้งานได้

อ้างอิงจาก: https://en.wikipedia.org/wiki/Division_algorithm#Integer_division_(unsigned)_with_remainder ดัดแปลงสำหรับกรณีการใช้งานนี้โดยเฉพาะ

// q = (2^128 - 1) / d, where q is the 64 LSBs of the quotient
//
// Binary long division (shift-and-subtract) specialized for a dividend whose
// 128 bits are all 1: each step shifts the remainder left and brings in a 1.
// The remainder is kept in two words because it can briefly need 65 bits.
// Quotient bits 64..127 are computed but not stored.
// For d == 0 the comparison always succeeds, so the result is UINT64_MAX.
uint64_t two_pow_128_minus_1_div_d(uint64_t d) {
    uint64_t q = 0, r_hi = 0, r_lo = 0;

    for (int i = 127; i >= 0; --i) {
        // remainder = (remainder << 1) | next dividend bit, which is always 1
        r_hi = (r_hi << 1) | (r_lo >> 63);
        r_lo = (r_lo << 1) | 1u;

        // remainder >= d: subtract d and record a 1 in quotient bit i
        if (r_hi || r_lo >= d) {
            const uint64_t borrow = d > r_lo;
            r_lo -= d;
            r_hi -= borrow;

            // Shift a 64-bit one: "1UL << i" is undefined for i >= 32 on
            // platforms where unsigned long is only 32 bits wide.
            if (i < 64)
                q |= (uint64_t)1 << i;
        }
    }
    return q;
}

ยังไงเพื่อแบ่งเป็น 128-bit dividend โดย 64-bit divisor อยู่ที่ไหน dividend เป็นชิ้นเป็น 1,และที่ไหนที่ฉันแค่ต้องการที่ 64 LSBs ของ quotient?

คำถาม

คำตอบที่ดีที่สุด

ในภาษาอื่นๆ

หน้านี้อยู่ในภาษาอื่นๆ

ดังอยู่ในนี้หมวดหมู่

ดังคำถามอยู่ในนี้หมวดหมู่