March Madness Day 3 – Hardware is fun!
Uncategorized
Add comments
Mar 03, 2010
So as not to leave my fellow hardware engineers unrepresented, I present my entry for day 3 of March Madness:
The following Verilog-2001 code implements (poorly, no doubt) the 64-bit floating-point reciprocal approximation unit from the Cray-1. Warning: not for the faint of heart =)
Here’s the code:
//The floating point reciprocal approximation unit
//
//070ijx  Floating Reciprocal approximation of (Sj) to Si
//This instruction is executed in the reciprocal approximation unit.
//The instruction forms an approximation to the reciprocal of the normalized
//floating point quantity in Sj and enters the result into Si. This
//instruction occurs in the divide sequence to compute the quotient of
//two floating point quantities as described in section 3 under floating
//point arithmetic.
//The reciprocal approximation instruction produces a result that is
//accurate to 27 bits. A second approximation may be generated to
//extend the accuracy to 47 bits using the reciprocal iteration instruction.
//
//Hold issue conditions
//  -> 034-037 in process
//  -> exchange in process
//  -> Si or Sj reserved
//  -> 174 in process; unit busy (VL) + 4 CPs
//Execution time: Si ready in 14 CPs, instruction issue in 1 CP
//Assumes that bit 47 of Sj is a 1 (i.e. it is already normalized)
//
//Special cases
//  - An arithmetic error allows 17 CPs + 2 parcels to issue if the fp error flag is set
//  - (Si) is meaningless if (Sj) is not normalized; the unit assumes
//    that bit 47 of (Sj)=1; no test is made of this bit.
//  - (Sj) = 0 produces a range error; the result is meaningless
//  - (Sj) = 0 if j = 0
//
//Implementation notes
//  - Three Newton-Raphson refinements X(n+1) = X(n) * (2 - X(n)*B), seeded
//    from the recip_lut look-up table, working on 5, 8 and 16 operand bits.
//  - Fully pipelined: o_result is registered 14 clock edges after i_sj is
//    sampled (13 stages for sign/exponent/mantissa plus the output register).
//  - Ports:
//      clk      : clock; every register updates on its rising edge
//      i_sj     : operand; bit 63 = sign, bits 62:48 = exponent, bits 47:0 = mantissa
//      o_result : result, packed with the same {sign, exponent, mantissa} layout
module float_recip(clk,i_sj,o_result);

input  wire        clk;        //system clock
input  wire [63:0] i_sj;
output reg  [63:0] o_result;

//Delayed copies of the 16 most significant mantissa bits (sj[k] is k+1 clocks old)
reg  [15:0] sj [8:0];

//First iteration
wire [3:0]  x_init;
reg  [4:0]  x0, x0_2, x0_3, x0_4;
reg  [9:0]  x1_a;
reg  [10:0] x1_b1;
reg  [10:0] x1_b2;
wire [15:0] x1_c;

//Second iteration
reg  [7:0]  x1, x1_2, x1_3, x1_4;
reg  [15:0] x2_a;
reg  [16:0] x2_b1;
reg  [16:0] x2_b2;
wire [24:0] x2_c;

//Third iteration
reg  [15:0] x2, x2_2, x2_3, x2_4;
reg  [31:0] x3_a;
reg  [32:0] x3_b1;
reg  [32:0] x3_b2;
wire [48:0] x3_c;
reg  [47:0] x3;

//Side-band pipelines that travel alongside the mantissa datapath
reg  [14:0] exponent [12:0];
reg  [12:0] sign;
reg  [12:0] is_half;

//High when the top 8 mantissa bits are 10000000.
//NOTE(review): only bits 47:40 are compared, so any mantissa in
//[0.5, 0.5 + 2^-8) takes the "exactly 1/2" path - confirm this is intended.
wire in_half = (i_sj[47:40] == 8'b10000000);

//Output register: the 1/2 case bypasses the iteration result and emits a
//mantissa of exactly 1/2 (MSB set, remaining 47 bits clear).
always @(posedge clk)
begin
    if (is_half[12])
        o_result <= {sign[12], exponent[12], 1'b1, 47'b0};
    else
        o_result <= {sign[12], exponent[12], x3};
end

//Pipeline the i_sj signal along
always @(posedge clk)
begin : sj_pipe
    integer k;
    sj[0] <= i_sj[47:32];
    for (k = 1; k < 9; k = k + 1)
        sj[k] <= sj[k-1];
end

//Detect if the input mantissa is 1/2 and carry the flag to the output stage
always @(posedge clk)
begin : half_pipe
    integer k;
    is_half[0] <= in_half;
    for (k = 1; k < 13; k = k + 1)
        is_half[k] <= is_half[k-1];
end

////////////////////////////////
//      First Iteration       //
////////////////////////////////

//Clock 1
//look-up table to give us initial guess
//Input  = first 7 bits after MSB of starting mantissa
//Output = first 4 bits after MSB of result mantissa
recip_lut lut(.n(i_sj[46:40]), .mantissa(x_init));

always @(posedge clk)
begin
    x0   <= {1'b1, x_init};
    //delay X0 so it lines up with x1_b2 at clock 4
    x0_2 <= x0;
    x0_3 <= x0_2;
    x0_4 <= x0_3;
end

//Clock 2
//X0 * B should be close to 1 (but slightly more than)
always @(posedge clk)
    x1_a <= x0 * sj[0][15:11];

//Clock 3
//2 - X0*B (should be slightly less than 1)
always @(posedge clk)
    x1_b1 <= {1'b1, 10'b0} - {1'b0, x1_a};

//Clock 4
//now shift it as necessary (left-justify; at most two positions)
always @(posedge clk)
begin
    if (x1_b1[10])
        x1_b2 <= x1_b1;
    else if (x1_b1[9])
        x1_b2 <= x1_b1 << 1;
    else
        x1_b2 <= x1_b1 << 2;
end

//Clock 5
//X0*(2-X0*B)
assign x1_c = x0_4 * x1_b2;

//only keep the 8 MSB's of the product, shifting if necessary
always @(posedge clk)
begin
    if (x1_c[15])
        x1 <= x1_c[15:8];
    else if (x1_c[14])
        x1 <= x1_c[14:7];
    else
        x1 <= x1_c[13:6];
    //delay X1 so it lines up with x2_b2 at clock 8
    x1_2 <= x1;
    x1_3 <= x1_2;
    x1_4 <= x1_3;
end

////////////////////////////////
//       2nd Iteration        //
////////////////////////////////

//Clock 6
//X1 * B
always @(posedge clk)
    x2_a <= x1 * sj[4][15:8];

//Clock 7
//2 - X1*B
always @(posedge clk)
    x2_b1 <= {1'b1, 16'b0} - {1'b0, x2_a};

//Clock 8
//and shift as necessary
always @(posedge clk)
begin
    if (x2_b1[16])
        x2_b2 <= x2_b1;
    else if (x2_b1[15])
        x2_b2 <= x2_b1 << 1;
    else
        x2_b2 <= x2_b1 << 2;
end

//Clock 9
//X1*(2-X1*B)
assign x2_c = x1_4 * x2_b2;

//keep only the 16 MSB of the product and shift as necessary
always @(posedge clk)
begin
    if (x2_c[24])
        x2 <= x2_c[24:9];
    else if (x2_c[23])
        x2 <= x2_c[23:8];
    else
        x2 <= x2_c[22:7];
    //delay X2 so it lines up with x3_b2 at clock 12
    x2_2 <= x2;
    x2_3 <= x2_2;
    x2_4 <= x2_3;
end

//////////////////////////////////
//        3rd Iteration         //
//////////////////////////////////

//Clock 10
//X2 * B
always @(posedge clk)
    x3_a <= x2 * sj[8];

//Clock 11
//2 - X2*B
always @(posedge clk)
    x3_b1 <= {1'b1, 32'b0} - {1'b0, x3_a};

//Clock 12
//and shift as necessary
always @(posedge clk)
begin
    if (x3_b1[32])
        x3_b2 <= x3_b1;
    else if (x3_b1[31])
        x3_b2 <= x3_b1 << 1;
    else
        x3_b2 <= x3_b1 << 2;
end

//Clock 13
//X2*(2-X2*B)
assign x3_c = x2_4 * x3_b2;

//shift x3 as necessary to produce the 48-bit result mantissa
always @(posedge clk)
begin
    if (x3_c[48])
        x3[47:0] <= x3_c[48:1];
    else if (x3_c[47])
        x3[47:0] <= x3_c[47:0];
    else
        x3[47:0] <= {x3_c[46:0], 1'b0};
end

//calculate and pipeline the exponent
//Stage 0 is the two's-complement negation of the input exponent field after
//an adjustment: the adjustment is 2 when the 1/2 detect fires and 1
//otherwise, and it is subtracted when i_sj[62] is set, added when it is clear.
always @(posedge clk)
begin : exponent_pipe
    integer k;
    exponent[0] <= in_half
                 ? (i_sj[62] ? (~(i_sj[62:48] - 15'b10) + 15'b1)
                             : (~(i_sj[62:48] + 15'b10) + 15'b1))
                 : (i_sj[62] ? (~(i_sj[62:48] - 15'b1)  + 15'b1)
                             : (~(i_sj[62:48] + 15'b1)  + 15'b1));
    for (k = 1; k < 13; k = k + 1)
        exponent[k] <= exponent[k-1];
end

//pipeline the sign (the reciprocal keeps the sign of the operand)
always @(posedge clk)
begin : sign_pipe
    integer k;
    sign[0] <= i_sj[63];
    for (k = 1; k < 13; k = k + 1)
        sign[k] <= sign[k-1];
end

endmodule

//And the look-up table!
//This file is the look-up table for the initial guess
// of the floating point reciprocal unit
// (because it uses newton-raphson)
//The input is the first 7 bits after the MSB of the input mantissa
//the output is the 4 bits after the MSB of the result mantissa
//
//Purely combinational: every one of the 128 possible values of n is listed,
//so mantissa is always driven and no latch is inferred.
//
//All results are written as sized binary literals (4'b....). A bare literal
//such as 1111 or 0111 is an unsized DECIMAL number in Verilog; truncated to
//4 bits it yields a different pattern (decimal 1111 -> 4'b0111,
//decimal 111 -> 4'b1111), which corrupts the table.
module recip_lut(n, mantissa);

input  wire [6:0] n;         //7 mantissa bits following the (assumed set) MSB
output reg  [3:0] mantissa;  //4 bits following the MSB of the initial guess

always @*
begin
    case (n[6:0])
        //n == 0 is the "mantissa is 1/2" case, which the reciprocal unit
        //special-cases at its output
        7'd0:
            mantissa = 4'b0000;

        7'd1, 7'd2, 7'd3, 7'd4:
            mantissa = 4'b1111;

        7'd5, 7'd6, 7'd7, 7'd8:
            mantissa = 4'b1110;

        7'd9, 7'd10, 7'd11, 7'd12, 7'd13:
            mantissa = 4'b1101;

        7'd14, 7'd15, 7'd16, 7'd17, 7'd18:
            mantissa = 4'b1100;

        7'd19, 7'd20, 7'd21, 7'd22, 7'd23:
            mantissa = 4'b1011;

        7'd24, 7'd25, 7'd26, 7'd27, 7'd28, 7'd29:
            mantissa = 4'b1010;

        7'd30, 7'd31, 7'd32, 7'd33, 7'd34, 7'd35:
            mantissa = 4'b1001;

        7'd36, 7'd37, 7'd38, 7'd39, 7'd40, 7'd41, 7'd42:
            mantissa = 4'b1000;

        7'd43, 7'd44, 7'd45, 7'd46, 7'd47, 7'd48, 7'd49, 7'd50:
            mantissa = 4'b0111;

        7'd51, 7'd52, 7'd53, 7'd54, 7'd55, 7'd56, 7'd57, 7'd58:
            mantissa = 4'b0110;

        7'd59, 7'd60, 7'd61, 7'd62, 7'd63, 7'd64, 7'd65, 7'd66, 7'd67:
            mantissa = 4'b0101;

        7'd68, 7'd69, 7'd70, 7'd71, 7'd72, 7'd73, 7'd74, 7'd75, 7'd76:
            mantissa = 4'b0100;

        7'd77, 7'd78, 7'd79, 7'd80, 7'd81, 7'd82, 7'd83, 7'd84, 7'd85,
        7'd86, 7'd87:
            mantissa = 4'b0011;

        7'd88, 7'd89, 7'd90, 7'd91, 7'd92, 7'd93, 7'd94, 7'd95, 7'd96,
        7'd97, 7'd98, 7'd99:
            mantissa = 4'b0010;

        7'd100, 7'd101, 7'd102, 7'd103, 7'd104, 7'd105, 7'd106, 7'd107,
        7'd108, 7'd109, 7'd110, 7'd111, 7'd112:
            mantissa = 4'b0001;

        7'd113, 7'd114, 7'd115, 7'd116, 7'd117, 7'd118, 7'd119, 7'd120,
        7'd121, 7'd122, 7'd123, 7'd124, 7'd125, 7'd126, 7'd127:
            mantissa = 4'b0000;
    endcase
end

endmodule
