patterncppMinor

Mostly portable 128 by 64 bit division

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

codereview cpp c++stackoverflow algorithm c++11 bitwise integer

128bitdivisionmostlyportable

Problem

I wrote this out of curiosity. It is based on the theory behind Knuth's Algorithm D and is intended to emulate the behavior of the x86 div instruction (though the result is truncated instead of raising an exception for overflow).

I am primarily interested in correctness (it seems to work correctly), and have not yet compared performance with anything simpler.

Two's complement hardware is assumed.

```
uint64_t div(uint64_t a_lo, uint64_t a_hi, uint64_t b, uint64_t &r)
{
uint64_t p_lo;
uint64_t p_hi;
uint64_t q = 0;

auto r_hi = a_hi;
auto r_lo = a_lo;

int s = 0;
if(0 == (b >> 63)){

// Normalize so quotient estimates are
// no more than 2 in error.

// Note: If any bits get shifted out of
// r_hi at this point, the result would
// overflow.

s = 63 - bsr(b);
const auto t = 64 - s;

b > t);
r_lo > 32;

/*
The first full-by-half division places b
across r_hi and r_lo, making the reduction
step a little complicated.

To make this easier, u_hi and u_lo will hold
a shifted image of the remainder.

[u_hi|| ][u_lo|| ]
[r_hi|| ][r_lo|| ]
[ b || ]
[p_hi|| ][p_lo|| ]
|
V
[q_hi|| ]
*/

auto q_hat = r_hi / b_hi;

p_lo = mul(b, q_hat, p_hi);

const auto u_hi = r_hi >> 32;
const auto u_lo = (r_hi > 32);

// r -= b*q_hat
//
// At most 2 iterations of this...
while(
(p_hi > u_hi) ||
((p_hi == u_hi) && (p_lo > u_lo))
)
{
if(p_lo > 32);

if(w_lo > r_lo){
++w_hi;
}

r_lo -= w_lo;
r_hi -= w_hi;

q = q_hat > 32)) / b_hi;

p_lo = mul(b, q_hat, p_hi);

// r -= b*q_hat
//
// ...and at most 2 iterations of this.
while(
(p_hi > r_hi) ||
((p_hi == r_hi) && (p_lo > r_lo))
)
{
if(p_lo > s;

return q;
}

Solution

The original code seemed too complicated for this so I set out to write a simpler version. Should be fairly portable too, and fast as it is just bit shifts and subtraction.

(It works similarly to how one would accomplish division in assembly language on a processor without hardware division.)

// 128-bit / 64-bit unsigned divide

#include 
#include 

int main(void)
{
  // numerator
  uint64_t a_lo = 1234567;
  uint64_t a_hi = 0;

  // denominator
  uint64_t b = 10;

  // quotient
  uint64_t q = a_lo > 63;
  uint64_t temp_carry = 0;
  int i;

  for(i = 0; i > 63;
    rem = b)
      {
        carry = 1;
      }
      else
      {
        temp_carry = q >> 63;
        q > 63;
    q <<= 1;
    q |= carry;
    carry = temp_carry;
  }

  printf("quotient = %llu\n", (long long unsigned int)q);
  printf("remainder = %llu\n", (long long unsigned int)rem);
  return 0;
}

Code Snippets

// 128-bit / 64-bit unsigned divide

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  // numerator
  uint64_t a_lo = 1234567;
  uint64_t a_hi = 0;

  // denominator
  uint64_t b = 10;

  // quotient
  uint64_t q = a_lo << 1;

  // remainder
  uint64_t rem = a_hi;

  uint64_t carry = a_lo >> 63;
  uint64_t temp_carry = 0;
  int i;

  for(i = 0; i < 64; i++)
  {
    temp_carry = rem >> 63;
    rem <<= 1;
    rem |= carry;
    carry = temp_carry;

    if(carry == 0)
    {
      if(rem >= b)
      {
        carry = 1;
      }
      else
      {
        temp_carry = q >> 63;
        q <<= 1;
        q |= carry;
        carry = temp_carry;
        continue;
      }
    }

    rem -= b;
    rem -= (1 - carry);
    carry = 1;
    temp_carry = q >> 63;
    q <<= 1;
    q |= carry;
    carry = temp_carry;
  }

  printf("quotient = %llu\n", (long long unsigned int)q);
  printf("remainder = %llu\n", (long long unsigned int)rem);
  return 0;
}

Context

StackExchange Code Review Q#67962, answer score: 3

Revisions (0)

No revisions yet.