Mar 03 2010
 

So as not to leave my fellow hardware engineers unrepresented, I present my entry for day 3 of March Madness:

The following code implements (poorly, no doubt) the 64-bit floating point reciprocal approximation unit from the Cray-1 (in Verilog 2001). Warning, not for the faint of heart =)

Here’s the code:

//The floating point reciprocal approximation unit
//070ijx        Floating Reciprocal approximation of (Sj) to Si
//This instruction is executed in the reciprocal approximation unit.
//The instruction forms an approximation to the reciprocal of the normalized
//floating point quantity in Sj and enters the result into Si. This
//instruction occurs in the divide sequence to compute the quotient of
//two floating point quantities as described in section 3 under floating
//point arithmetic.
//The reciprocal approximation instruction produces a result that is
//accurate to 27 bits. A second approximation may be generated to
//extend the accuracy to 47 bits using the reciprocal iteration instruction
//Hold issue conditions
//-> 034-037 in process
//-> exchange in process
//-> Si or Sj reserved
//-> 174 in process; unit busy (VL) + 4 CPs
//Execution time: Si ready in 14 CPs, instruction issue in 1 CP
//assumes that bit 47 of Sj is a 1 (i.e. it is already normalized)
//special cases
//-An arithmetic error allows 17 CPs + 2 parcels to issue if the fp error flag is set
//-(Si) is meaningless if (Sj)  is not normalized; the unit assumes
// that bit 47 of (Sj)=1; no test is made of this bit.
//-(Sj) = 0 produces a range error; the result is meaningless
//-(Sj) = 0 if j = 0
// float_recip: pipelined reciprocal approximation of a 64-bit floating
// point operand laid out as {sign[63], exponent[62:48], coefficient[47:0]}.
//
// Ports
//   clk      : clock; every stage is registered on the rising edge
//   i_sj     : operand
//   o_result : reciprocal approximation in the same layout; it is the 14th
//              register stage after i_sj (x0 .. x3 are stages 1-13)
//
// The coefficient is refined with three Newton-Raphson steps
//     X(n+1) = X(n) * (2 - X(n) * B)
// using 5, 8 and then 16 bits of X and B. The first guess comes from
// recip_lut. Only i_sj[47:32] is ever used as B.
//
// NOTE(review): is_half only inspects i_sj[47:40], so every operand whose
// coefficient starts with 10000000 gets a result coefficient of exactly 1/2,
// whatever its lower 40 bits are. Confirm that this loss of precision is
// acceptable before relying on the result for such operands.
module float_recip(clk,i_sj,o_result);
input  wire clk;        //system clock
input  wire [63:0] i_sj;
output reg [63:0] o_result;

reg  [15:0] sj [8:0];                 //top 16 coefficient bits, delayed 1..9 clocks
wire [3:0]  x_init;                   //look-up table output
reg  [4:0]  x0, x0_2, x0_3, x0_4;     //initial guess and its delayed copies
reg  [9:0]  x1_a;
reg  [10:0] x1_b1;
reg  [10:0] x1_b2;
wire [15:0] x1_c;
reg  [7:0]  x1, x1_2, x1_3, x1_4;     //1st iteration result and delayed copies
reg  [15:0] x2_a;
reg  [16:0] x2_b1;
reg  [16:0] x2_b2;
wire [24:0] x2_c;
reg  [15:0] x2, x2_2, x2_3, x2_4;     //2nd iteration result and delayed copies
reg  [31:0] x3_a;
reg  [32:0] x3_b1;
reg  [32:0] x3_b2;
wire [48:0] x3_c;
reg  [47:0] x3;                       //3rd iteration result
reg  [14:0] exponent [12:0];          //result exponent, delayed 1..13 clocks
reg  [12:0] sign;                     //operand sign, delayed 1..13 clocks
reg  [12:0] is_half;                  //"coefficient is 1/2" flag, delayed 1..13 clocks

//Each looping always block owns its index so that no two processes
//share a loop variable.
integer sj_i;
integer exp_i;

//Top 8 coefficient bits are 1000_0000: the operand is treated as having a
//coefficient of exactly 1/2 (see the NOTE(review) in the module header).
wire half_in = (i_sj[47:40] == 8'b10000000);

//Output stage: a coefficient of 1/2 has reciprocal 2, which is emitted as
//coefficient 1/2 with the exponent already bumped in exponent[].
always@(posedge clk)
    o_result <= is_half[12] ? {sign[12],exponent[12],1'b1,47'b0}
                            : {sign[12],exponent[12],x3};

//Pipeline the i_sj signal along
always@(posedge clk)
begin
    sj[0] <= i_sj[47:32];
    for (sj_i = 1; sj_i < 9; sj_i = sj_i + 1)
        sj[sj_i] <= sj[sj_i-1];
end

//Detect if the input mantissa is 1/2 and carry the flag to the output stage
always@(posedge clk)
    is_half <= {is_half[11:0], half_in};

////////////////////////////////
//      First Iteration       //
////////////////////////////////
//Clock 1
//look-up table to give us initial guess
//Input  = first 7 bits after MSB of starting mantissa
//Output = first 4 bits after MSB of result mantissa
recip_lut lut(.n(i_sj[46:40]), .mantissa(x_init));

always@(posedge clk)
begin
    x0   <= {1'b1,x_init};
    x0_2 <= x0;               //delay line: x0_4 lines up with x1_b2
    x0_3 <= x0_2;
    x0_4 <= x0_3;
end

//Clock 2
//X0 * B
always@(posedge clk)
    x1_a <= x0 * sj[0][15:11];

//Clock 3
//2 - X0*B
always@(posedge clk)
    x1_b1 <= (11'b10000000000 - {1'b0,x1_a});

//Clock 4
//left-justify so that the leading 1 sits in the MSB
always@(posedge clk)
    x1_b2 <= x1_b1[10] ? x1_b1 : x1_b1[9] ? x1_b1 << 1 : x1_b1 << 2;

//Clock 5
//X0*(2-X0*B)
assign x1_c = x0_4 * x1_b2;

//keep only the 8 MSBs of the product, starting at its leading 1
always@(posedge clk)
begin
    x1   <= x1_c[15] ? x1_c[15:8] : x1_c[14] ? x1_c[14:7] : x1_c[13:6];
    x1_2 <= x1;               //delay line: x1_4 lines up with x2_b2
    x1_3 <= x1_2;
    x1_4 <= x1_3;
end

////////////////////////////////
//         2nd iteration      //
////////////////////////////////
//Clock 6
//X1 * B
always@(posedge clk)
    x2_a <= x1 * sj[4][15:8];

//Clock 7
//2 - X1*B
always@(posedge clk)
    x2_b1 <= (17'b10000000000000000 - {1'b0,x2_a});

//Clock 8
//left-justify
always@(posedge clk)
    x2_b2 <= x2_b1[16] ? x2_b1 : x2_b1[15] ? x2_b1 << 1 : x2_b1 << 2;

//Clock 9
//X1*(2-X1*B)
assign x2_c = x1_4 * x2_b2;

//keep only the 16 MSBs of the product, starting at its leading 1
always@(posedge clk)
begin
    x2   <= x2_c[24] ? x2_c[24:9] : x2_c[23] ? x2_c[23:8] : x2_c[22:7];
    x2_2 <= x2;               //delay line: x2_4 lines up with x3_b2
    x2_3 <= x2_2;
    x2_4 <= x2_3;
end

//////////////////////////////////
//   3rd Iteration              //
//////////////////////////////////
//Clock 10
//X2 * B
always@(posedge clk)
    x3_a <= x2 * sj[8];

//Clock 11
//2 - X2*B
always@(posedge clk)
    x3_b1 <= (33'b100000000000000000000000000000000 - {1'b0,x3_a});

//Clock 12
//left-justify
always@(posedge clk)
    x3_b2 <= x3_b1[32] ? x3_b1 : x3_b1[31] ? x3_b1 << 1 : x3_b1 << 2;

//Clock 13
//X2*(2-X2*B)
assign x3_c = x2_4 * x3_b2;

//keep 48 bits of the product, starting at its leading 1
always@(posedge clk)
    x3[47:0] <= x3_c[48] ? x3_c[48:1] : x3_c[47] ? x3_c[47:0] : {x3_c[46:0],1'b0};

//calculate and pipeline the exponent
//With coefficient B in [1/2,1), 1/B lies in (1,2], so the normalized result
//is (1/(2B)) * 2^(1-e); for B = 1/2 it is (1/2) * 2^(2-e).
//Cray-1 exponents carry a bias of 2^14, hence E' = 1 - E + 2*2^14, which is
//1 - E modulo 2^15 (2 - E for the 1/2 case). This holds for every E; no
//separate handling of exponents below the bias is needed.
always@(posedge clk)
begin
    exponent[0] <= (half_in ? 15'd2 : 15'd1) - i_sj[62:48];
    for (exp_i = 1; exp_i < 13; exp_i = exp_i + 1)
        exponent[exp_i] <= exponent[exp_i-1];
end

//pipeline the sign (the reciprocal keeps the operand's sign)
always@(posedge clk)
    sign <= {sign[11:0], i_sj[63]};

endmodule

//And the look-up table!
//This file is the look-up table for the initial guess
// of the floating point reciprocal unit
// (because it uses newton-raphson)
//The input is the first 7 bits after the MSB of the input mantissa
//the output is the 4 bits after the MSB of the result mantissa
// recip_lut: combinational look-up table giving the initial guess for the
// reciprocal iteration.
//
// Ports
//   n        : the 7 bits that follow the MSB of the input mantissa
//   mantissa : the 4 bits that follow the MSB of the result mantissa
//
// The table is a monotone staircase, so it is written as a ladder of upper
// bounds on n instead of 128 separate case items. Entry n = 0 is the one
// exception: it returns 0000, not the 1111 of its neighbours.
//
// Every value is a sized binary literal. An unsized literal such as 1111 is
// a DECIMAL number in Verilog and would be truncated to its low 4 bits
// (1111 decimal -> 4'b0111), silently corrupting the table.
module recip_lut(n, mantissa);
input  wire [6:0] n;
output reg [3:0] mantissa;

always@*
begin
    if      (n == 7'd0)   mantissa = 4'b0000;  //   0
    else if (n <= 7'd4)   mantissa = 4'b1111;  //   1 ..   4
    else if (n <= 7'd8)   mantissa = 4'b1110;  //   5 ..   8
    else if (n <= 7'd13)  mantissa = 4'b1101;  //   9 ..  13
    else if (n <= 7'd18)  mantissa = 4'b1100;  //  14 ..  18
    else if (n <= 7'd23)  mantissa = 4'b1011;  //  19 ..  23
    else if (n <= 7'd29)  mantissa = 4'b1010;  //  24 ..  29
    else if (n <= 7'd35)  mantissa = 4'b1001;  //  30 ..  35
    else if (n <= 7'd42)  mantissa = 4'b1000;  //  36 ..  42
    else if (n <= 7'd50)  mantissa = 4'b0111;  //  43 ..  50
    else if (n <= 7'd58)  mantissa = 4'b0110;  //  51 ..  58
    else if (n <= 7'd67)  mantissa = 4'b0101;  //  59 ..  67
    else if (n <= 7'd76)  mantissa = 4'b0100;  //  68 ..  76
    else if (n <= 7'd87)  mantissa = 4'b0011;  //  77 ..  87
    else if (n <= 7'd99)  mantissa = 4'b0010;  //  88 ..  99
    else if (n <= 7'd112) mantissa = 4'b0001;  // 100 .. 112
    else                  mantissa = 4'b0000;  // 113 .. 127
end

endmodule