-
Notifications
You must be signed in to change notification settings - Fork 63
/
x86-mem-studies.asm
221 lines (182 loc) · 3.8 KB
/
x86-mem-studies.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
BITS 64
default rel
%include "x86-helpers.asm"
nasm_util_assert_boilerplate
thunk_boilerplate
; classic pointer chasing benchmark
;
; Chases a one-node pointer loop ([rsp] stores rsp itself) through a load
; whose 8-byte access straddles a 64-byte boundary: rsp is aligned to 64
; and then lowered by 66, so it sits 2 bytes before the next boundary.
; Each load is followed by three zero-adds (extends the dep chain) and an
; lfence, to study replays/stalls on cache-line-crossing loads.
;
; In:     rdi = outer iteration count (decremented to zero)
; Uses:   rax = chase pointer, r15 = 0 (zero index for the load),
;         rbp = saved rsp. rbx/rcx/r8 are initialized but not read in
;         the loop (presumably leftovers from experiments - kept as-is
;         since they run once, outside the timed loop).
define_bench replay_crossing
push_callee_saved
mov rbp, rsp ; save incoming stack pointer
and rsp, -64 ; align down to a cache line
sub rsp, 66 ; rsp points 2 bytes before a 64b boundary
mov [rsp], rsp ; set up the pointer loop: *rsp == rsp
mov rax, rsp ; rax = pointer to chase
mov rbx, rsp
mov rcx, rsp
xor r15, r15 ; r15 = 0, used as a (zero) index register below
mov r8, 1
; note: a redundant second `mov rcx, rsp` was removed here (dead code:
; rcx already held rsp and nothing modified it in between)
align 64
.top:
%rep 100
mov rax, [rax + r15] ; p = *p : line-crossing load, 5-cycle+ dep chain
add rax, 0 ; extend the dependency chain without changing the value
add rax, 0
add rax, 0
lfence ; serialize: wait for the load (and any replays) to retire
%endrep
dec rdi
jnz .top
mov rsp, rbp ; restore the unaligned stack pointer
pop_callee_saved
ret
; open a store-forwarding benchmark function
;
; %1 - name of the benchmark to define
;
; Sets up a frame, reserves 128 bytes of cache-line-aligned stack
; scratch, points rcx at the same address as rsp (computed via a
; lea/sub round trip, presumably so the address arrives through a
; different dependency chain - TODO confirm intent), fills xmm0 with
; all-ones, and opens the .top loop that fw_define_end later closes.
; The benchmark body is the code placed between the two macros.
%macro fw_define_start 1
define_bench %1
push rbp
mov rbp, rsp ; rbp = frame pointer for the matching epilogue
and rsp, -64 ; cache-line align the scratch region
sub rsp, 128 ; 128 bytes of scratch; rsp stays 64B-aligned
lea rcx, [rsp + 10]
sub rcx, 10 ; rcx == rsp, but produced by arithmetic
mov eax, 1
vpcmpeqd xmm0, xmm0, xmm0 ; xmm0 = all ones (16B store payload)
jmp .top
align 32 ; align the hot loop entry
.top:
%endmacro
; close a benchmark opened by fw_define_start: loop until rdi
; (the iteration count passed by the harness) reaches zero, then
; restore the frame set up by fw_define_start and return
%macro fw_define_end 0
dec rdi ; rdi = remaining outer iterations
jnz .top
mov rsp, rbp ; undo the and/sub from fw_define_start
pop rbp
ret
%endmacro
; baseline store-forwarding case: 8-byte store immediately followed by
; an exactly-overlapping 8-byte load at the same aligned address
fw_define_start fw_write_read
%rep 100
mov [rsp], rax ; 8B store to aligned scratch
mov rax, [rsp] ; 8B load of the same bytes - forwarded from the store
%endrep
fw_define_end
; an exactly overlapping forward, but which splits a cache line
; (rsp is 64B-aligned, so an 8B access at rsp+63 spans two lines)
fw_define_start fw_write_read_cl_split
%rep 100
mov [rsp + 63], rax ; line-splitting 8B store
mov rax, [rsp + 63] ; line-splitting 8B load of the same bytes
%endrep
fw_define_end
; same as fw_write_read, but addressed through rcx (which holds the
; same address as rsp, computed by fw_define_start via lea/sub)
fw_define_start fw_write_read_rcx
%rep 100
mov [rcx], rax ; 8B store via the non-rsp base register
mov rax, [rcx] ; exactly-overlapping 8B load
%endrep
fw_define_end
; %1 the number of reads following the write
;
; defines benchmark fw_write_read_rcxN: one 8B store through rcx
; followed by %1 back-to-back exactly-overlapping 8B loads, to see
; how multiple consumers of one forwarded store behave
%macro define_fw_write_read_rcxX 1
fw_define_start fw_write_read_rcx%1
%rep 100
mov [rcx], rax ; single store
times %1 mov rax, [rcx] ; %1 loads, all hitting the same store
%endrep
fw_define_end
%endmacro
; instantiate variants with 2..5 loads per store
define_fw_write_read_rcxX 2
define_fw_write_read_rcxX 3
define_fw_write_read_rcxX 4
define_fw_write_read_rcxX 5
; the mirror image of fw_write_read_rcxN: four back-to-back stores to
; the same address followed by a single load (forwarded from the last)
fw_define_start fw_write_read_rcx4s
%rep 100
times 4 mov [rcx], rax ; 4 stores to the same 8 bytes
mov rax, [rcx] ; one load consuming the youngest store
%endrep
fw_define_end
; 16-byte (xmm) variant of the baseline: aligned 16B store followed by
; an exactly-overlapping aligned 16B load
fw_define_start fw_write_readx
%rep 100
vmovdqa [rsp], xmm0 ; aligned 16B store
vmovdqa xmm0, [rsp] ; exactly-overlapping 16B load
%endrep
fw_define_end
; one 16B load fed by TWO adjacent 8B stores: the load needs data from
; both stores, so a simple single-store forward cannot satisfy it
fw_define_start fw_split_write_read
%rep 100
mov [rsp], rax ; low 8 bytes
mov [rsp + 8], rax ; high 8 bytes
vmovdqa xmm0, [rsp] ; 16B load spanning both stores
%endrep
fw_define_end
; like fw_split_write_read, but the loaded value is moved back into rax
; (the store payload), chaining iterations so the full store->load->store
; latency is exposed rather than overlapped
fw_define_start fw_split_write_read_chained
%rep 100
mov [rsp], rax ; low 8 bytes
mov [rsp + 8], rax ; high 8 bytes
vmovdqa xmm0, [rsp] ; 16B load spanning both stores
vmovq rax, xmm0 ; feed the result back into the next stores
%endrep
fw_define_end
; the reverse split: one 16B store, then an 8B load of its low half,
; chained back into xmm0 (the store payload) for a measurable dep chain
fw_define_start fw_write_split_read
%rep 100
vmovdqa [rsp], xmm0 ; 16B store
mov rax, [rsp] ; 8B load of the low half of the stored data
vmovq xmm0, rax ; chain: result feeds the next 16B store
%endrep
fw_define_end
; same as fw_write_split_read, but reads from both halves of the 16b written value
; this mostly just adds 1 cycle to the time without revealing any new uarch details
; on current uarch, but maybe it will fail in the future
fw_define_start fw_write_split_read_both
%rep 100
vmovdqa [rsp], xmm0 ; 16B store
mov rax, [rsp] ; 8B load of the low half
add rax, [rsp + 8] ; 8B load of the high half, folded in
vmovq xmm0, rax ; chain: result feeds the next 16B store
%endrep
fw_define_end
; write a null NT line then one extra
;
; %1 size of store in bytes
; %2 full instruction to use
; %3 0 or 1
; 0: normal full line write
; 1: extra write per line
%macro define_nt_extra 3
%assign BITSIZE (%1 * 8)
%if %3 == 0
define_bench nt_normal_ %+ BITSIZE
%else
define_bench nt_extra_ %+ BITSIZE
%endif
mov rdx, [rsi + region.size]
mov rsi, [rsi + region.start]
xor eax, eax
mov r8, -1
vpcmpeqd ymm0, ymm0, ymm0
.top:
mov rax, rdx
mov rcx, rsi
.inner:
%assign offset 0
%rep (64 / %1)
%2
%assign offset (offset + %1)
%endrep
%if %3 != 0
%assign offset 0
%rep 5
%2
%assign offset ((offset + %1) % 64)
%endrep
%endif
%undef offset ; needed to prevent offset from being expanded wrongly in the macro invocations below
add rcx, 64
sub rax, 64
jge .inner
dec rdi
jnz .top
ret
%endmacro
; define both the normal (%3 == 0) and extra-write (%3 == 1) variants
; of a non-temporal store benchmark
;
; %1 store size in bytes
; %2 full store instruction (braced so commas survive macro expansion);
;    it references `offset`, which define_nt_extra assigns per store
%macro define_nt_both 2
define_nt_extra %1, {%2}, 0
define_nt_extra %1, {%2}, 1
%endmacro
; instantiate for 4/8-byte scalar and 16/32/64-byte vector NT stores
define_nt_both 4, {movnti [rcx + offset], r8d}
define_nt_both 8, {movnti [rcx + offset], r8 }
define_nt_both 16, {vmovntdq [rcx + offset], xmm0}
define_nt_both 32, {vmovntdq [rcx + offset], ymm0}
define_nt_both 64, {vmovntdq [rcx + offset], zmm0}