-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimul32_improve.s
279 lines (231 loc) · 5.91 KB
/
imul32_improve.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
.data
# Test vectors for fmul32. Each test_N label holds two consecutive 32-bit
# words: the raw IEEE-754 single-precision bit patterns of the two operands.
# main reads them as 0(ptr) and 4(ptr).
# Size reminder for the directives used below:
# dword -> 8 bytes
# word -> 4 bytes
test_1: .word 0x40200000, 0x40200000 # 2.5 * 2.5 = 6.25
test_2: .word 0x3e800000, 0x40800000 # 0.25 * 4 = 1
test_3: .word 0xc0600000, 0x417a0000 # -3.5 * 15.625 = -54.6875
# Banner printed before every result, and the newline printed after it.
msg_string: .string "floating point multiplication \n"
enter: .string "\n"
.text
# main: multiply each test pair with fmul32 and print the product.
#
# A three-entry table of pointers to the test pairs is built on the stack and
# walked with a cursor until it reaches the end of the table.
#   s2 = cursor into the pointer table
#   s0 = address one past the last table entry
# ecall service numbers are taken from the original code (4 = print string,
# 2 = print float from a0, 10 = exit); NOTE(review): these look like the
# Ripes-style environment calls - confirm against the target simulator.
main:
    # Build the pointer table: 0(sp) -> test_1, 4(sp) -> test_2, 8(sp) -> test_3.
    addi sp, sp, -12
    la   t0, test_3
    sw   t0, 8(sp)
    la   t0, test_2
    sw   t0, 4(sp)
    la   t0, test_1
    sw   t0, 0(sp)

    mv   s2, sp                 # cursor starts at the first table entry
    addi s0, sp, 12             # stop once the cursor reaches this address

main_loop:
    # Banner line.
    li   a7, 4
    la   a0, msg_string
    ecall

    # Fetch both operands of the current test pair and multiply them.
    lw   t0, 0(s2)              # t0 = address of the current pair
    lw   a0, 0(t0)              # a0 = first operand bits
    lw   a1, 4(t0)              # a1 = second operand bits
    jal  fmul32                 # a0 = product bits

    # Print the product, then a newline.
    li   a7, 2
    ecall
    li   a7, 4
    la   a0, enter
    ecall

    addi s2, s2, 4              # advance to the next table entry
    bne  s2, s0, main_loop

    # Release the pointer table and terminate the program.
    addi sp, sp, 12
    li   a7, 10
    ecall
# fmul32: software multiply of two IEEE-754 single-precision values.
#
# In:   a0 = bit pattern of a, a1 = bit pattern of b
# Out:  a0 = bit pattern of a * b
# Calls: imul32 (24x24-bit mantissa product), getbit (normalisation test)
# Clobbers: a1, a2, t0-t3 (s0-s4 and ra are saved and restored)
#
# Behaviour and limits:
#   - If either operand is exactly +0 or -0 the result is a zero carrying the
#     XOR of the two sign bits. Without this check the implicit leading 1
#     would be OR-ed into a zero mantissa and a non-zero value returned.
#   - The mantissa is truncated, not rounded.
#   - Subnormals, infinities, NaN and exponent overflow/underflow are NOT
#     handled: the exponent is simply masked to 8 bits.
fmul32:
    addi sp, sp, -24
    sw   ra, 0(sp)
    sw   s0, 4(sp)
    sw   s1, 8(sp)
    sw   s2, 12(sp)
    sw   s3, 16(sp)
    sw   s4, 20(sp)

    # s0 = (sign_a ^ sign_b) << 31, already in its final bit position.
    xor  s0, a0, a1
    srli s0, s0, 31
    slli s0, s0, 31

    # Zero short-circuit: shifting out the sign bit leaves 0 only for +/-0.
    # At fmul32_done s0 is returned as-is, i.e. a correctly signed zero.
    slli t0, a0, 1
    beq  t0, zero, fmul32_done
    slli t0, a1, 1
    beq  t0, zero, fmul32_done

    # Extract 23-bit fractions and restore the implicit leading 1 (bit 23).
    li   t0, 0x7FFFFF
    li   t1, 0x800000
    and  s1, a0, t0
    or   s1, s1, t1             # s1 = mantissa_a (24 bits)
    and  s2, a1, t0
    or   s2, s2, t1             # s2 = mantissa_b (24 bits)

    # Extract the biased 8-bit exponents.
    srli s3, a0, 23
    andi s3, s3, 0xFF           # s3 = exp_a
    srli s4, a1, 23
    andi s4, s4, 0xFF           # s4 = exp_b

    # 64-bit mantissa product: a0 = low word, a1 = high word.
    mv   a0, s1
    mv   a1, s2
    jal  imul32

    # s1 = low 32 bits of (product >> 23), assembled from both halves.
    srli s1, a0, 23
    slli s2, a1, 9
    or   s1, s1, s2

    # mshift = bit 24 of the shifted product: set when the product of the two
    # 1.x mantissas reached 2.0 or more and needs one more right shift.
    # Position 24 is below 32, so getbit only looks at a0.
    mv   a0, s1
    li   a2, 24                 # a2 = bit position to test
    jal  getbit                 # a0 = mshift (0 or 1)
    srl  s1, s1, a0             # normalise the mantissa

    # Result exponent = exp_a + exp_b - bias + mshift, kept to 8 bits.
    add  s3, s3, s4
    addi s3, s3, -127
    add  s3, s3, a0
    andi s3, s3, 0xFF
    slli s3, s3, 23             # exponent in bits 30..23

    # Drop the implicit leading 1 and pack sign | exponent | fraction.
    li   t0, 0x7FFFFF
    and  s1, s1, t0
    or   s0, s0, s3
    or   s0, s0, s1

fmul32_done:
    mv   a0, s0
    lw   ra, 0(sp)
    lw   s0, 4(sp)
    lw   s1, 8(sp)
    lw   s2, 12(sp)
    lw   s3, 16(sp)
    lw   s4, 20(sp)
    addi sp, sp, 24
    ret
# imul32 (improved): unsigned 32 x 32 -> 64-bit multiply by shift-and-add.
#
# In:   a0 = multiplicand, a1 = multiplier
# Out:  a0 = low 32 bits of the product, a1 = high 32 bits of the product
# Clobbers: t0, t2, t3 (s0-s2 are saved and restored; ra is not modified)
#
# The 64-bit accumulator is the register pair s2:s1. s1 starts out holding
# the multiplier; on every iteration its lowest bit decides whether the
# multiplicand is added into the high word, then the whole pair is shifted
# right by one. After 32 iterations all multiplier bits have been consumed
# and s2:s1 holds the product.
#
# The addition into the high word can overflow 32 bits. The carry is
# captured with sltu and shifted back in as bit 31 of the high word, so the
# result is exact for every pair of 32-bit operands, not only for small ones.
imul32:
    addi sp, sp, -12
    sw   s0, 0(sp)
    sw   s1, 4(sp)
    sw   s2, 8(sp)

    mv   s0, a0                 # s0 = multiplicand
    mv   s1, a1                 # s1 = multiplier, becomes the low result word
    li   s2, 0                  # s2 = high result word
    li   t0, 32                 # t0 = iterations remaining

imul32_loop:
    andi t2, s1, 1              # t2 = current multiplier bit
    beq  t2, zero, imul32_skip  # bit clear: t2 == 0 also means "no carry"
    add  s2, s2, s0             # high += multiplicand (mod 2^32)
    sltu t2, s2, s0             # t2 = 1 if that addition wrapped around
imul32_skip:
    # Shift the 65-bit value carry:s2:s1 right by one position.
    srli s1, s1, 1
    slli t3, s2, 31             # bit 0 of the high word moves to bit 31 of low
    or   s1, s1, t3
    srli s2, s2, 1
    slli t2, t2, 31             # the carry becomes bit 31 of the high word
    or   s2, s2, t2

    addi t0, t0, -1
    bne  t0, zero, imul32_loop

imul32_end:
    mv   a0, s1
    mv   a1, s2
    lw   s0, 0(sp)
    lw   s1, 4(sp)
    lw   s2, 8(sp)
    addi sp, sp, 12
    ret
# getbit: return one bit of the 64-bit value held in a1:a0.
#
# In:   a0 = low word, a1 = high word, a2 = bit position
# Out:  a0 = the selected bit (0 or 1)
# Note: for positions >= 32 (signed compare) the bit is taken from the high
#       word and a1 is overwritten with that same bit; for lower positions
#       a1 is left untouched. s0 and ra are saved and restored.
getbit:
    addi sp, sp, -8
    sw   s0, 4(sp)
    sw   ra, 0(sp)

    slti s0, a2, 32             # s0 = 1 when the position is below 32
    beq  s0, zero, getbit_l     # otherwise the bit lives in the high word

    # Low word: shift the wanted bit down to bit 0 and isolate it.
    srl  a0, a0, a2
    andi a0, a0, 1
    j    getbit_end

getbit_l:
    # High word: same operation with the position rebased by 32.
    addi s0, a2, -32
    srl  a0, a1, s0
    andi a0, a0, 1
    mv   a1, a0                 # a1 ends up holding the bit as well

getbit_end:
    lw   s0, 4(sp)
    lw   ra, 0(sp)
    addi sp, sp, 8
    ret
# inc: increment the 64-bit value in a1:a0 using only shifts and bitwise
# logic (no add with carry).
#
# In:   a0 = low word of x, a1 = high word of x
# Out:  a1:a0 = (x & ~mask) | z1, where
#         mask = mask_lowest_zero(x)       (the run of 1 bits at the bottom)
#         z1   = mask ^ ((mask << 1) | 1)  (the lowest 0 bit of x)
#       Clearing the trailing ones and setting the lowest zero amounts to
#       x + 1; when x is all ones z1 is 0 and the result wraps to 0.
# Calls: mask_lowest_zero
# Clobbers: t0-t3
inc:
    addi sp, sp, -12            # operands are comma-separated (was "sp -12")
    sw   ra, 0(sp)
    sw   a0, 4(sp)
    sw   a1, 8(sp)              # keep x: the callee overwrites a0/a1

    jal  mask_lowest_zero
    mv   t0, a0
    mv   t1, a1                 # t1:t0 = mask

    lw   a0, 4(sp)
    lw   a1, 8(sp)              # a1:a0 = x again

    # t3:t2 = (mask << 1) | 1 as a 64-bit value.
    slli t3, t1, 1
    srli t2, t0, 31             # bit 31 of the low word crosses into the high
    or   t3, t3, t2
    slli t2, t0, 1
    ori  t2, t2, 1

    # t3:t2 = z1 = mask ^ ((mask << 1) | 1).
    xor  t2, t2, t0
    xor  t3, t3, t1

    # a1:a0 = (x & ~mask) | z1.
    not  t1, t1
    not  t0, t0
    and  a1, a1, t1
    and  a0, a0, t0
    or   a1, a1, t3
    or   a0, a0, t2

    lw   ra, 0(sp)
    addi sp, sp, 12
    ret
# mask_lowest_zero: 64-bit mask computation on the pair a1:a0.
#
# In:   a0 = low word, a1 = high word of the 64-bit input
# Out:  a1:a0 = mask after the doubling sequence
#         mask &= (mask << k) | (2^k - 1)    for k = 1, 2, 4, 8, 16, 32
#       which leaves only the unbroken run of 1 bits that starts at bit 0.
# Clobbers: t0, t1, t3
#
# Every 64-bit shift is done in two halves. The high word is updated first,
# using the bits that spill out of the still unmodified low word, and only
# then is the low word itself updated.
mask_lowest_zero:
    addi sp, sp, -4
    sw   ra, 0(sp)

    # k = 1
    srli t0, a0, 31             # bit spilling from the low into the high word
    slli t1, a1, 1
    or   t1, t1, t0
    and  a1, a1, t1
    slli t0, a0, 1
    ori  t0, t0, 1
    and  a0, a0, t0

    # k = 2
    srli t0, a0, 30
    slli t1, a1, 2
    or   t1, t1, t0
    and  a1, a1, t1
    slli t0, a0, 2
    ori  t0, t0, 3
    and  a0, a0, t0

    # k = 4
    srli t0, a0, 28
    slli t1, a1, 4
    or   t1, t1, t0
    and  a1, a1, t1
    slli t0, a0, 4
    ori  t0, t0, 0xF
    and  a0, a0, t0

    # k = 8
    srli t0, a0, 24
    slli t1, a1, 8
    or   t1, t1, t0
    and  a1, a1, t1
    slli t0, a0, 8
    ori  t0, t0, 0xFF
    and  a0, a0, t0

    # k = 16 (0xFFFF does not fit an ori immediate, so it goes through t3)
    srli t0, a0, 16
    slli t1, a1, 16
    or   t1, t1, t0
    and  a1, a1, t1
    li   t3, 0xFFFF
    slli t0, a0, 16
    or   t0, t0, t3
    and  a0, a0, t0

    # k = 32: (mask << 32) moves the low word into the high word and the
    # OR-ed 0xFFFFFFFF leaves the low word unchanged.
    and  a1, a1, a0

    lw   ra, 0(sp)
    addi sp, sp, 4
    ret