memcpy.S 6.73 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301


/*
 *
 * Optimized version of the standard memcpy() function
 *
 * Inputs:
 * 	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 * 	r8:	destination address (in0), as returned by the standard memcpy()
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <asm/asmmacro.h>

GLOBAL_ENTRY(memcpy)
	/*
	 * Entry point and fast path.  in0 = destination, in1 = source,
	 * in2 = byte count; the destination address is handed back in r8.
	 *
	 * Dispatch performed below:
	 *   len == 0                      -> return immediately
	 *   len < 16                      -> .memcpy_short (byte loop)
	 *   (dst | src | len) & 7 != 0    -> .memcpy_long
	 *   otherwise                     -> fall through into the ld8/st8 loop
	 */

#	define MEM_LAT	21		/* latency to memory; sets the pipeline depth (N) below */

	/*
	 * Register aliases:
	 *   dst, src      running destination/source pointers (copies of in0/in1)
	 *   retval        return value register
	 *   saved_pfs,
	 *   saved_lc,
	 *   saved_pr      copies of ar.pfs, ar.lc and pr, restored before returning
	 *   cnt           loop/byte counter
	 *   src2          8-byte aligned source pointer (used by .memcpy_long)
	 *   t0..t4        scratch
	 *   src_end       address of the last source word (used by .memcpy_long)
	 */
#	define dst	r2
#	define src	r3
#	define retval	r8
#	define saved_pfs r9
#	define saved_lc	r10
#	define saved_pr	r11
#	define cnt	r16
#	define src2	r17
#	define t0	r18
#	define t1	r19
#	define t2	r20
#	define t3	r21
#	define t4	r22
#	define src_end	r23

	/*
	 * N is the number of stages of the aligned ld8/st8 loop (it is loaded
	 * into ar.ec and sizes val[] and p[]); Nrot is N rounded up to a
	 * multiple of 8 and is the size handed to alloc for the local and
	 * rotating regions.
	 */
#	define N	(MEM_LAT + 4)
#	define Nrot	((N + 7) & ~7)

	/*
	 * First, check if everything (src, dst, len) is a multiple of eight.  If
	 * so, we handle everything with no taken branches (other than the loop
	 * itself) and a small icache footprint.  Otherwise, we jump off to
	 * the more general copy routine handling arbitrary
	 * sizes/alignment etc.
	 */
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc
	or t0=in0,in1		// t0 = dst | src
	;;

	or t0=t0,in2		// t0 = dst | src | len
	.save pr, saved_pr
	mov saved_pr=pr

	.body

	cmp.eq p6,p0=in2,r0	// zero length?
	mov retval=in0		// return dst
(p6)	br.ret.spnt.many rp	// zero length, return immediately
	;;

	mov dst=in0		// copy because of rotation
	shr.u cnt=in2,3		// number of 8-byte words to copy
	mov pr.rot=1<<16	// p16 (= p[0]) set, remaining rotating predicates cleared
	;;

	adds cnt=-1,cnt		// br.ctop is repeat/until
	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
	mov ar.ec=N		// epilogue count = number of pipeline stages
	;;

	and t0=0x7,t0		// low 3 bits of (dst | src | len)
	mov ar.lc=cnt		// loop count = (number of words) - 1
	;;
	cmp.ne p6,p0=t0,r0	// p6 = something is not a multiple of 8

	mov src=in1		// copy because of rotation
(p7)	br.cond.spnt.few .memcpy_short	// len < 16: byte-wise copy
(p6)	br.cond.spnt.few .memcpy_long	// misaligned pointer or odd length: general copy
	;;
	/*
	 * NOTE(review): the nop groups below look like padding ahead of the
	 * loop; the reason for them is not stated in this file.
	 */
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	/*
	 * Software-pipelined copy loop: stage 0 loads a word from [src],
	 * stage N-1 stores it to [dst]; both pointers post-increment by 8.
	 */
	.rotr val[N]
	.rotp p[N]
	.align 32
1: { .mib
(p[0])	ld8 val[0]=[src],8
	nop.i 0
	brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
	nop.f 0
	br.ctop.dptk.few 1b
}
	;;
	/* Restore the registers saved at entry and return. */
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Small (<16 bytes) copying, aligned or not, is done via a simple
	 * byte-at-a-time copy loop.  This performs relatively poorly on Itanium,
	 * but it doesn't get used very often (gcc inlines small copies) and due
	 * to atomicity issues, we want to avoid read-modify-write of entire words.
	 *
	 * On entry: in2 = byte count (1..15), src/dst hold the source and
	 * destination pointers, and p[0] has already been set via pr.rot.
	 */
	.align 32
.memcpy_short:
	adds cnt=-1,in2		// br.ctop is repeat/until
	mov ar.ec=MEM_LAT	// the byte loop has MEM_LAT stages
	brp.loop.imp 1f, 2f
	;;
	mov ar.lc=cnt		// loop count = (number of bytes) - 1
	;;
	/*
	 * NOTE(review): the nop groups below look like padding ahead of the
	 * loop; the reason for them is not stated in this file.
	 */
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	nop.m	0
	;;
	/*
	 * It is faster to put a stop bit in the loop here because it makes
	 * the pipeline shorter (and latency is what matters on short copies).
	 *
	 * Stage 0 loads one byte from [src], stage MEM_LAT-1 stores it to
	 * [dst]; both pointers post-increment by 1.
	 */
	.align 32
1: { .mib
(p[0])	ld1 val[0]=[src],1
	nop.i 0
	brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
	nop.f 0
	br.ctop.dptk.few 1b
} ;;
	/* Restore the registers saved at entry and return. */
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
	 * an overriding concern here, but throughput is.  We first do
	 * sub-word copying until the destination is aligned, then we check
	 * if the source is also aligned.  If so, we do a simple load/store-loop
	 * until there are less than 8 bytes left over and then we do the tail,
	 * by storing the last few bytes using sub-word copying.  If the source
	 * is not aligned, we branch off to the non-congruent loop.
	 *
	 *   stage:   op:
	 *         0  ld
	 *	   :
	 * MEM_LAT+3  shrp
	 * MEM_LAT+4  st
	 *
	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
	 * seems to introduce an unavoidable bubble in the pipeline so the overall
	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
	 * of 4 byte/cycle.  Still not bad.
	 */
	/*
	 * N and Nrot are redefined for the long path: the loops now have
	 * MEM_LAT + 5 stages, and the rotating region must hold val[N+1] plus
	 * w[2], rounded up to a multiple of 8.
	 */
#	undef N
#	undef Nrot
#	define N	(MEM_LAT + 5)		/* number of stages */
#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */

	/*
	 * log2 of the byte distance between consecutive COPY() expansions at
	 * .memcpy_loops; used below to compute the loop entry address.
	 * NOTE(review): this assumes every COPY() expansion assembles to
	 * exactly 1 << LOG_LOOP_SIZE bytes -- confirm if the macro is changed.
	 */
#define LOG_LOOP_SIZE	6

.memcpy_long:
	/* The value written to t3 here is not used; t3 is overwritten below. */
	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
	and t0=-8,src		// t0 = src & ~7
	and t2=7,src		// t2 = src & 7
	;;
	ld8 t0=[t0]		// t0 = 1st source word
	adds src2=7,src		// src2 = (src + 7)
	sub t4=r0,dst		// t4 = -dst
	;;
	and src2=-8,src2	// src2 = (src + 7) & ~7
	shl t2=t2,3		// t2 = 8*(src & 7)
	shl t4=t4,3		// t4 = (-dst) << 3, i.e. (-dst & 7) moved to bits 3-5
	;;
	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
	sub t3=64,t2		// t3 = 64-8*(src & 7)
	shr.u t0=t0,t2		// drop the bytes of the 1st word that precede src
	;;
	add src_end=src,in2
	shl t1=t1,t3		// move the following bytes into the vacated upper part
	mov pr=t4,0x38		// (p5,p4,p3) = (-dst & 7) = number of bytes needed to align dst
	;;
	or t0=t0,t1		// t0 = the 8 source bytes starting at src
	mov cnt=r0		// cnt = number of head bytes stored so far
	adds src_end=-1,src_end	// src_end = address of the last source byte
	;;
	/* Store 1, 2 and/or 4 head bytes from t0 so that dst becomes 8-byte aligned. */
(p3)	st1 [dst]=t0,1
(p3)	shr.u t0=t0,8
(p3)	adds cnt=1,cnt
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
(p4)	adds cnt=2,cnt
	;;
(p5)	st4 [dst]=t0,4
(p5)	adds cnt=4,cnt
	and src_end=-8,src_end	// src_end = last word of source buffer
	;;

	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:

1:{	add src=cnt,src			// make src point to remainder of source buffer
	sub cnt=in2,cnt			// cnt = number of bytes left to copy
	mov t4=ip			// t4 = address of this bundle (label 1)
  }	;;
	and src2=-8,src			// align source pointer
	adds t4=.memcpy_loops-1b,t4	// t4 = address of .memcpy_loops
	mov ar.ec=N			// epilogue count = number of pipeline stages

	and t0=7,src			// t0 = src & 7
	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
	shl cnt=cnt,3			// move bits 0-2 to 3-5
	;;

	.rotr val[N+1], w[2]
	.rotp p[N]

	cmp.ne p6,p0=t0,r0		// is src aligned, too?
	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = offset of the matching COPY() loop
	adds t2=-1,t2			// br.ctop is repeat/until
	;;
	add t4=t0,t4			// t4 = entry address of the loop for this source misalignment
	mov pr=cnt,0x38			// set (p5,p4,p3) to the number of trailing (last-word) bytes to copy
	mov ar.lc=t2
	;;
	/*
	 * NOTE(review): the nop groups below look like padding ahead of the
	 * dispatch; the reason for them is not stated in this file.
	 */
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	/* Only when src is misaligned (p6): preload the first aligned source word. */
(p6)	ld8 val[1]=[src2],8		// prime the pump...
	mov b6=t4
	br.sptk.few b6			// jump to the selected COPY() loop
	;;

.memcpy_tail:
	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
	// less than 8) and t0 contains the last few bytes of the src buffer:
	//
	// The trailing bytes are written as an optional 4-, 2- and 1-byte store, shifting
	// t0 right after each one; the registers saved at entry (ar.lc, ar.pfs, pr) are
	// restored in between and the routine returns from here.
(p5)	st4 [dst]=t0,4
(p5)	shr.u t0=t0,32
	mov ar.lc=saved_lc
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
	mov ar.pfs=saved_pfs
	;;
(p3)	st1 [dst]=t0
	mov pr=saved_pr,-1		// restored last: p3-p5 are still needed by the stores above
	br.ret.sptk.many rp

///////////////////////////////////////////////////////
	.align 64

/*
 * COPY(shift,index) expands to one software-pipelined copy loop followed by
 * the hand-off to .memcpy_tail:
 *
 *   stage 0          load the next aligned source word from [src2]
 *   stage MEM_LAT+3  shrp: combine two loaded words, shifted right by
 *                    "shift" bits, into one destination word
 *   stage MEM_LAT+4  store that word to [dst]
 *
 * After the loop, the last source word is loaded from [src_end] and combined
 * the same way into t0, which .memcpy_tail then stores piecewise.
 *
 *   shift   8 * (src & 7), the source misalignment in bits
 *   index   1 when src is 8-byte aligned, 0 otherwise; it selects which
 *           val[] register is used as the second shrp operand
 *
 * .memcpy_long jumps to .memcpy_loops + ((src & 7) << LOG_LOOP_SIZE), so the
 * expansions below must stay in order of increasing shift.
 * NOTE(review): this also assumes each expansion is exactly
 * 1 << LOG_LOOP_SIZE bytes long -- confirm before editing the macro.
 */
#define COPY(shift,index)									\
 1: { .mib											\
	(p[0])		ld8 val[0]=[src2],8;							\
	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
			brp.loop.imp 1b, 2f							\
    };												\
 2: { .mfb											\
	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
			nop.f 0;								\
			br.ctop.dptk.few 1b;							\
    };												\
			;;									\
			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
			;;									\
			shrp t0=val[N-1],val[N-index],shift;					\
			br .memcpy_tail
.memcpy_loops:
	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
	COPY(8, 0)
	COPY(16, 0)
	COPY(24, 0)
	COPY(32, 0)
	COPY(40, 0)
	COPY(48, 0)
	COPY(56, 0)

END(memcpy)