1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
4 */
5
6#include <linux/linkage.h>
7#include <asm/cache.h>
8
/*
 * The memset implementation below is optimized to use the prefetchw and
 * prealloc instructions on CPUs with a 64B L1 data cache line
 * (L1_CACHE_SHIFT == 6).
 * If you want to implement an optimized memset for the other possible L1 data
 * cache line lengths (32B and 128B), rewrite the code carefully and check
 * that no prefetchw/prealloc instruction is ever issued for an L1 cache line
 * that does not belong to the memset area.
 */
17
/*
 * Cache-hint wrappers used by memset below.
 *
 *   PREALLOC_INSTR  reg, off  - expands to "prealloc  [reg, off]"
 *   PREFETCHW_INSTR reg, off  - expands to "prefetchw [reg, off]"
 *
 * The hints are only emitted when the L1 data cache line is 64 bytes
 * (L1_CACHE_SHIFT == 6), because the loop in memset is laid out for that
 * line size. For any other line size both macros expand to nothing, so the
 * call sites assemble unchanged but issue no cache hints.
 */
#if L1_CACHE_SHIFT == 6

.macro PREALLOC_INSTR	reg, off
	prealloc	[\reg, \off]
.endm

.macro PREFETCHW_INSTR	reg, off
	prefetchw	[\reg, \off]
.endm

#else

; Line size is not 64B: keep the macro names defined, emit no instruction
.macro PREALLOC_INSTR	reg, off
.endm

.macro PREFETCHW_INSTR	reg, off
.endm

#endif
37
38ENTRY_CFI(memset)
39	mov.f	0, r2
40;;; if size is zero
41	jz.d	[blink]
42	mov	r3, r0		; don't clobber ret val
43
44	PREFETCHW_INSTR	r0, 0	; Prefetch the first write location
45
46;;; if length < 8
47	brls.d.nt	r2, 8, .Lsmallchunk
48	mov.f	lp_count,r2
49
50	and.f	r4, r0, 0x03
51	rsub	lp_count, r4, 4
52	lpnz	@.Laligndestination
53	;; LOOP BEGIN
54	stb.ab	r1, [r3,1]
55	sub	r2, r2, 1
56.Laligndestination:
57
58;;; Destination is aligned
59	and	r1, r1, 0xFF
60	asl	r4, r1, 8
61	or	r4, r4, r1
62	asl	r5, r4, 16
63	or	r5, r5, r4
64	mov	r4, r5
65
66	sub3	lp_count, r2, 8
67	cmp     r2, 64
68	bmsk.hi	r2, r2, 5
69	mov.ls	lp_count, 0
70	add3.hi	r2, r2, 8
71
72;;; Convert len to Dwords, unfold x8
73	lsr.f	lp_count, lp_count, 6
74
75	lpnz	@.Lset64bytes
76	;; LOOP START
77	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching
78
79#ifdef CONFIG_ARC_HAS_LL64
80	std.ab	r4, [r3, 8]
81	std.ab	r4, [r3, 8]
82	std.ab	r4, [r3, 8]
83	std.ab	r4, [r3, 8]
84	std.ab	r4, [r3, 8]
85	std.ab	r4, [r3, 8]
86	std.ab	r4, [r3, 8]
87	std.ab	r4, [r3, 8]
88#else
89	st.ab	r4, [r3, 4]
90	st.ab	r4, [r3, 4]
91	st.ab	r4, [r3, 4]
92	st.ab	r4, [r3, 4]
93	st.ab	r4, [r3, 4]
94	st.ab	r4, [r3, 4]
95	st.ab	r4, [r3, 4]
96	st.ab	r4, [r3, 4]
97	st.ab	r4, [r3, 4]
98	st.ab	r4, [r3, 4]
99	st.ab	r4, [r3, 4]
100	st.ab	r4, [r3, 4]
101	st.ab	r4, [r3, 4]
102	st.ab	r4, [r3, 4]
103	st.ab	r4, [r3, 4]
104	st.ab	r4, [r3, 4]
105#endif
106.Lset64bytes:
107
108	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
109	lpnz	.Lset32bytes
110	;; LOOP START
111#ifdef CONFIG_ARC_HAS_LL64
112	std.ab	r4, [r3, 8]
113	std.ab	r4, [r3, 8]
114	std.ab	r4, [r3, 8]
115	std.ab	r4, [r3, 8]
116#else
117	st.ab	r4, [r3, 4]
118	st.ab	r4, [r3, 4]
119	st.ab	r4, [r3, 4]
120	st.ab	r4, [r3, 4]
121	st.ab	r4, [r3, 4]
122	st.ab	r4, [r3, 4]
123	st.ab	r4, [r3, 4]
124	st.ab	r4, [r3, 4]
125#endif
126.Lset32bytes:
127
128	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
129.Lsmallchunk:
130	lpnz	.Lcopy3bytes
131	;; LOOP START
132	stb.ab	r1, [r3, 1]
133.Lcopy3bytes:
134
135	j	[blink]
136
137END_CFI(memset)
138
/*
 * memzero: zero-fill a buffer by tail-calling memset.
 *
 * In:   r0 = dst, r1 = len
 * Out:  whatever memset returns (r0 = dst)
 *
 * Rearranges the two arguments into the memset layout
 * (r0 = dst, r1 = fill byte = 0, r2 = len) and branches to memset.
 */
ENTRY_CFI(memzero)
    ; adjust bzero args to memset args
    mov r2, r1     ; len moves from 2nd to 3rd argument
    b.d  memset    ; tail call (branch without link), so no need to tinker with blink
    mov r1, 0      ; (delay slot) fill value = 0
END_CFI(memzero)
145