5113f6f70
김현기
kernel add
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
#include <asm/asmmacro.h>
#include <asm/page.h>
#define PREFETCH_DIST 8
#define src0 r2
#define src1 r3
#define dst0 r9
#define dst1 r10
#define src_pre_mem r11
#define dst_pre_mem r14
#define src_pre_l2 r15
#define dst_pre_l2 r16
#define t1 r17
#define t2 r18
#define t3 r19
#define t4 r20
#define t5 t1
#define t6 t2
#define t7 t3
#define t9 t5
#define t10 t4
#define t11 t7
#define t12 t6
#define t14 t10
#define t13 r21
#define t15 r22
#define saved_lc r23
#define saved_pr r24
#define A 0
#define B (PREFETCH_DIST)
#define C (B + PREFETCH_DIST)
#define D (C + 3)
#define N (D + 1)
#define Nrot ((N + 7) & ~7)
GLOBAL_ENTRY(copy_page)
.prologue
alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
.rotr v[2*PREFETCH_DIST], n[D-C+1]
.rotp p[N]
.save ar.lc, saved_lc
mov saved_lc = ar.lc
.save pr, saved_pr
mov saved_pr = pr
.body
mov src_pre_mem = in1
mov pr.rot = 0x10000
mov ar.ec = 1
mov dst_pre_mem = in0
mov ar.lc = 2*PREFETCH_DIST - 1
add src_pre_l2 = 8*8, in1
add dst_pre_l2 = 8*8, in0
add src0 = 8, in1
add src1 = 3*8, in1
add dst0 = 8, in0
add dst1 = 3*8, in0
mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
nop.m 0
nop.i 0
;;
.prefetch_loop:
(p[A]) ld8 v[A] = [src_pre_mem], 128
(p[B]) st8 [dst_pre_mem] = v[B], 128
br.ctop.sptk .prefetch_loop
;;
cmp.eq p16, p0 = r0, r0
mov ar.lc = t1
mov ar.ec = N
;;
.line_copy:
(p[D]) ld8 t2 = [src0], 3*8
(p[D]) ld8 t4 = [src1], 3*8
(p[B]) st8 [dst_pre_mem] = v[B], 128
(p[D]) st8 [dst_pre_l2] = n[D-C], 128
;;
(p[A]) ld8 v[A] = [src_pre_mem], 128
(p[C]) ld8 n[0] = [src_pre_l2], 128
(p[D]) st8 [dst0] = t1, 8
(p[D]) st8 [dst1] = t3, 8
;;
(p[D]) ld8 t5 = [src0], 8
(p[D]) ld8 t7 = [src1], 3*8
(p[D]) st8 [dst0] = t2, 3*8
(p[D]) st8 [dst1] = t4, 3*8
;;
(p[D]) ld8 t6 = [src0], 3*8
(p[D]) ld8 t10 = [src1], 8
(p[D]) st8 [dst0] = t5, 8
(p[D]) st8 [dst1] = t7, 3*8
;;
(p[D]) ld8 t9 = [src0], 3*8
(p[D]) ld8 t11 = [src1], 3*8
(p[D]) st8 [dst0] = t6, 3*8
(p[D]) st8 [dst1] = t10, 8
;;
(p[D]) ld8 t12 = [src0], 8
(p[D]) ld8 t14 = [src1], 8
(p[D]) st8 [dst0] = t9, 3*8
(p[D]) st8 [dst1] = t11, 3*8
;;
(p[D]) ld8 t13 = [src0], 4*8
(p[D]) ld8 t15 = [src1], 4*8
(p[D]) st8 [dst0] = t12, 8
(p[D]) st8 [dst1] = t14, 8
;;
(p[D-1])ld8 t1 = [src0], 8
(p[D-1])ld8 t3 = [src1], 8
(p[D]) st8 [dst0] = t13, 4*8
(p[D]) st8 [dst1] = t15, 4*8
br.ctop.sptk .line_copy
;;
mov ar.lc = saved_lc
mov pr = saved_pr, -1
br.ret.sptk.many rp
END(copy_page)
|