-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsse.cpp
105 lines (89 loc) · 4.65 KB
/
sse.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include "common.hpp"
/*
 * Declares the SSE (Xmm register) instruction cases for the benchmark.
 *
 * Every case goes through one of the GEN* macros, which are defined outside
 * this file (presumably in common.hpp -- confirm). As used below, each
 * invocation passes:
 *   - the register class (always Xmm here),
 *   - a label string for the case,
 *   - one or two code-emitting expressions written in terms of `g`, `dst`
 *     and `src`,
 *   - a boolean flag (always false here; meaning defined by the macros),
 *   - an OT_* operand-type tag (OT_INT / OT_FP32 / OT_FP64).
 *
 * NOTE(review): for the two-expression GEN_latency form, the [mem] cases
 * suggest the first expression is the independent (throughput) sequence and
 * the second is the dependency-chained (latency) sequence -- confirm against
 * the macro definition.
 *
 * The expression arguments are kept token-for-token as originally written
 * (including spacing) in case the macros stringify them.
 */
void test_sse()
{
    /* 128-bit (Xmm) cases */

    /* Aligned load: standalone, then chained through rdi via movq. */
    GEN_throughput_only(Xmm, "loadps",
                        (g->movaps(dst, g->ptr[g->rdx])),
                        false, OT_INT);
    GEN_latency_only(Xmm, "loadps->movq",
                     (g->movaps(dst, g->ptr[g->rdx + g->rdi])); (g->movq(g->rdi, dst));,
                     false, OT_INT);

    /* Xmm -> GPR -> Xmm round trip through rdi. */
    GEN(Xmm, "movq->movq",
        (g->movq(g->rdi,src));(g->movq(dst,g->rdi));,
        false, OT_INT);

    /* Basic packed floating-point arithmetic / logic. */
    GEN(Xmm, "xorps", (g->xorps(dst, src)), false, OT_FP32);
    GEN(Xmm, "addps", (g->addps(dst, src)), false, OT_FP32);
    GEN(Xmm, "mulps", (g->mulps(dst, src)), false, OT_FP32);
    GEN(Xmm, "divps", (g->divps(dst, src)), false, OT_FP32);
    GEN(Xmm, "divpd", (g->divpd(dst, src)), false, OT_FP64);

    /* Approximation instructions operate in place (dst, dst). */
    GEN(Xmm, "rsqrtps", (g->rsqrtps(dst, dst)), false, OT_FP32);
    GEN(Xmm, "rcpps", (g->rcpps(dst, dst)), false, OT_FP32);

    /* Blends. The first blendvps sequence clears dst with xorps after each
     * blend; the second is the bare instruction. */
    GEN(Xmm, "blendps", (g->blendps(dst, src, 0)), false, OT_FP32);
    GEN_latency(Xmm, "blendvps",
                (g->blendvps(dst, src));(g->xorps(dst,dst)),
                (g->blendvps(dst, src)),
                false, OT_FP32);

    /* Shuffles, integer multiply, horizontal adds. */
    GEN(Xmm, "pshufb", (g->pshufb(dst, src)), false, OT_INT);
    GEN(Xmm, "shufps", (g->shufps(dst, src, 0)), false, OT_FP32);
    GEN(Xmm, "pmullw", (g->pmullw(dst, src)), false, OT_INT);
    GEN(Xmm, "phaddd", (g->phaddd(dst, src)), false, OT_INT);
    /* Emits haddps so the instruction matches the label and the OT_FP32 tag
     * (this case previously emitted the integer phaddd). */
    GEN(Xmm, "haddps", (g->haddps(dst, src)), false, OT_FP32);

    /* Dword insert / extract. Emits pinsrd so the instruction matches the
     * label and the dword-wide pextrd it is paired with (these cases
     * previously emitted the byte-wide pinsrb). */
    GEN(Xmm, "pinsrd",
        (g->pinsrd(dst, g->edx, 0)), false, OT_INT);
    GEN_latency_only(Xmm, "pinsrd->pextr",
                     (g->pinsrd(dst, g->edx, 0));(g->pextrd(g->edx,dst,0)),
                     false, OT_INT);

    GEN(Xmm, "dpps", (g->dpps(dst, src, 0xff)), false, OT_FP32);
    GEN(Xmm, "cvtps2dq", (g->cvtps2dq(dst, src)), false, OT_FP32);

    /* Xmm -> GPR mask extraction, standalone and chained back through movq. */
    GEN_throughput_only(Xmm, "pmovmskb", (g->pmovmskb(g->edx,src)), false, OT_INT);
    GEN_latency_only(Xmm, "pmovmskb->movq",
                     (g->pmovmskb(g->edx,src));(g->movq(src,g->rdx)),
                     false, OT_INT);
    /* NOTE(review): this label duplicates the "movq->movq" GEN case above
     * (that one goes through rdi with src/dst, this one through rdx with src
     * only) -- confirm whether both are wanted under the same name. */
    GEN_latency_only(Xmm, "movq->movq",
                     (g->movq(g->rdx,src));(g->movq(src,g->rdx)),
                     false, OT_INT);

    /* Loads at various offsets from rdx. In each case the second sequence
     * adds rdi to the address and writes the loaded value back into rdi. */
    GEN_latency(Xmm, "movaps [mem]",
                (g->movaps(dst, g->ptr[g->rdx])),
                (g->movaps(dst, g->ptr[g->rdx + g->rdi])); (g->movq(g->rdi, dst)); ,
                false, OT_FP32);
    GEN_latency(Xmm, "movdqu [mem+1]",
                (g->movdqu(dst, g->ptr[g->rdx + 1])),
                (g->movdqu(dst, g->ptr[g->rdx + g->rdi + 1])); (g->movq(g->rdi, dst)); ,
                false, OT_FP32);
    GEN_latency(Xmm, "movdqu [mem+63] (cross cache)",
                (g->movdqu(dst, g->ptr[g->rdx + 63])),
                (g->movdqu(dst, g->ptr[g->rdx + g->rdi + 63])); (g->movq(g->rdi, dst)); ,
                false, OT_FP32);
    GEN_latency(Xmm, "movdqu [mem+2MB-1] (cross page)",
                (g->movdqu(dst, g->ptr[g->rdx + (2048*1024-1)])),
                (g->movdqu(dst, g->ptr[g->rdx + g->rdi + (2048*1024-1)])); (g->movq(g->rdi, dst)); ,
                false, OT_FP32);

    /* String-compare instructions, only when info reports SSE4.2. These use
     * the *_rcx_clobber macro variants; the "i" forms are chained through
     * rcx in their latency cases. */
    if (info.have_sse42) {
        GEN_throughput_only_rcx_clobber(Xmm, "pcmpistri",
                                        (g->pcmpistri(src,src,0)),
                                        false,OT_INT);
        GEN_latency_only_rcx_clobber(Xmm, "pcmpistri->movq",
                                     (g->pcmpistri(src,src,0));
                                     (g->movq(src,g->rcx));
                                     ,
                                     false,OT_INT);

        /* The "m" forms use xmm1 operands for throughput and xmm0 operands
         * for latency.
         * NOTE(review): presumably because the instruction's result lands in
         * xmm0 implicitly, so xmm0 operands form a dependency chain --
         * confirm against the ISA reference. */
        GEN_throughput_only_rcx_clobber(Xmm, "pcmpistrm",
                                        (g->pcmpistrm(g->xmm1,g->xmm1,0)),
                                        false,OT_INT);
        GEN_latency_only_rcx_clobber(Xmm, "pcmpistrm",
                                     (g->pcmpistrm(g->xmm0,g->xmm0,0)),
                                     false,OT_INT);

        GEN_throughput_only_rcx_clobber(Xmm, "pcmpestri",
                                        (g->pcmpestri(src,src,0)),
                                        false,OT_INT);
        GEN_latency_only_rcx_clobber(Xmm, "pcmpestri->movq",
                                     (g->pcmpestri(src,src,0));
                                     (g->movq(src,g->rcx));
                                     ,
                                     false,OT_INT);
        GEN_throughput_only_rcx_clobber(Xmm, "pcmpestrm",
                                        (g->pcmpestrm(g->xmm1,g->xmm1,0)),
                                        false,OT_INT);
        GEN_latency_only_rcx_clobber(Xmm, "pcmpestrm",
                                     (g->pcmpestrm(g->xmm0,g->xmm0,0)),
                                     false,OT_INT);
    }
}