1 | /* $NetBSD: subr_percpu.c,v 1.17 2014/11/27 15:00:00 uebayasi Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c)2007,2008 YAMAMOTO Takashi, |
5 | * All rights reserved. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions |
9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | * SUCH DAMAGE. |
27 | */ |
28 | |
29 | /* |
30 | * per-cpu storage. |
31 | */ |
32 | |
33 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.17 2014/11/27 15:00:00 uebayasi Exp $");
35 | |
36 | #include <sys/param.h> |
37 | #include <sys/cpu.h> |
38 | #include <sys/kmem.h> |
39 | #include <sys/kernel.h> |
40 | #include <sys/mutex.h> |
41 | #include <sys/percpu.h> |
42 | #include <sys/rwlock.h> |
43 | #include <sys/vmem.h> |
44 | #include <sys/xcall.h> |
45 | |
#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)	/* allocation unit of the arena */
#define	PERCPU_QCACHE_MAX	0			/* no vmem quantum cache */
#define	PERCPU_IMPORT_SIZE	2048			/* unit to grow the backend */
49 | |
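/*
 * Under DIAGNOSTIC, obfuscate the offset kept in a percpu_t handle by
 * XORing it with a magic number, so that code which mistakenly uses a
 * percpu_t as a real pointer fails early and loudly.
 */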
50 | #if defined(DIAGNOSTIC) |
51 | #define MAGIC 0x50435055 /* "PCPU" */ |
52 | #define percpu_encrypt(pc) ((pc) ^ MAGIC) |
53 | #define percpu_decrypt(pc) ((pc) ^ MAGIC) |
54 | #else /* defined(DIAGNOSTIC) */ |
55 | #define percpu_encrypt(pc) (pc) |
56 | #define percpu_decrypt(pc) (pc) |
57 | #endif /* defined(DIAGNOSTIC) */ |
58 | |
59 | static krwlock_t percpu_swap_lock __cacheline_aligned; |
60 | static kmutex_t percpu_allocation_lock __cacheline_aligned; |
61 | static vmem_t * percpu_offset_arena __cacheline_aligned; |
62 | static unsigned int percpu_nextoff __cacheline_aligned; |
63 | |
64 | static percpu_cpu_t * |
65 | cpu_percpu(struct cpu_info *ci) |
66 | { |
67 | |
68 | return &ci->ci_data.cpu_percpu; |
69 | } |
70 | |
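/*
 * percpu_offset: decode a percpu_t handle into the byte offset of its
 * storage within each cpu's pcc_data.
 */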
71 | static unsigned int |
72 | percpu_offset(percpu_t *pc) |
73 | { |
74 | const unsigned int off = percpu_decrypt((uintptr_t)pc); |
75 | |
76 | KASSERT(off < percpu_nextoff); |
77 | return off; |
78 | } |
79 | |
80 | /* |
 * percpu_cpu_swap: cross-call handler for percpu_cpu_enlarge
82 | */ |
83 | |
84 | static void |
85 | percpu_cpu_swap(void *p1, void *p2) |
86 | { |
87 | struct cpu_info * const ci = p1; |
88 | percpu_cpu_t * const newpcc = p2; |
89 | percpu_cpu_t * const pcc = cpu_percpu(ci); |
90 | |
91 | KASSERT(ci == curcpu() || !mp_online); |
92 | |
93 | /* |
	 * swap *pcc and *newpcc unless someone has beaten us to it.
95 | */ |
96 | rw_enter(&percpu_swap_lock, RW_WRITER); |
97 | if (newpcc->pcc_size > pcc->pcc_size) { |
98 | percpu_cpu_t tmp; |
99 | int s; |
100 | |
101 | tmp = *pcc; |
102 | |
103 | /* |
		 * block interrupts so that we don't lose modifications made
		 * from interrupt context.
105 | */ |
106 | |
107 | s = splhigh(); |
108 | |
109 | /* |
110 | * copy data to new storage. |
111 | */ |
112 | |
113 | memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size); |
114 | |
115 | /* |
116 | * this assignment needs to be atomic for percpu_getptr_remote. |
117 | */ |
118 | |
119 | pcc->pcc_data = newpcc->pcc_data; |
120 | |
121 | splx(s); |
122 | |
123 | pcc->pcc_size = newpcc->pcc_size; |
124 | *newpcc = tmp; |
125 | } |
126 | rw_exit(&percpu_swap_lock); |
127 | } |
128 | |
129 | /* |
 * percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has
 * enough space
131 | */ |
132 | |
133 | static void |
134 | percpu_cpu_enlarge(size_t size) |
135 | { |
136 | CPU_INFO_ITERATOR cii; |
137 | struct cpu_info *ci; |
138 | |
139 | for (CPU_INFO_FOREACH(cii, ci)) { |
140 | percpu_cpu_t pcc; |
141 | |
142 | pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */ |
143 | pcc.pcc_size = size; |
144 | if (!mp_online) { |
145 | percpu_cpu_swap(ci, &pcc); |
146 | } else { |
147 | uint64_t where; |
148 | |
149 | where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci); |
150 | xc_wait(where); |
151 | } |
		/* pcc_size == size if a concurrent enlarge has beaten us */
		KASSERT(pcc.pcc_size <= size);
153 | if (pcc.pcc_data != NULL) { |
154 | kmem_free(pcc.pcc_data, pcc.pcc_size); |
155 | } |
156 | } |
157 | } |
158 | |
159 | /* |
160 | * percpu_backend_alloc: vmem import callback for percpu_offset_arena |
161 | */ |
162 | |
163 | static int |
164 | percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize, |
165 | vm_flag_t vmflags, vmem_addr_t *addrp) |
166 | { |
167 | unsigned int offset; |
168 | unsigned int nextoff; |
169 | |
170 | ASSERT_SLEEPABLE(); |
171 | KASSERT(dummy == NULL); |
172 | |
173 | if ((vmflags & VM_NOSLEEP) != 0) |
174 | return ENOMEM; |
175 | |
176 | size = roundup(size, PERCPU_IMPORT_SIZE); |
177 | mutex_enter(&percpu_allocation_lock); |
178 | offset = percpu_nextoff; |
179 | percpu_nextoff = nextoff = percpu_nextoff + size; |
180 | mutex_exit(&percpu_allocation_lock); |
181 | |
182 | percpu_cpu_enlarge(nextoff); |
183 | |
184 | *resultsize = size; |
185 | *addrp = (vmem_addr_t)offset; |
186 | return 0; |
187 | } |
188 | |
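/*
 * percpu_zero_cb: percpu_foreach callback which zeroes one cpu's chunk.
 */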
189 | static void |
190 | percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci) |
191 | { |
192 | size_t sz = (uintptr_t)vp2; |
193 | |
194 | memset(vp, 0, sz); |
195 | } |
196 | |
197 | /* |
 * percpu_zero: zero the specified percpu storage on every cpu.
199 | */ |
200 | |
201 | static void |
202 | percpu_zero(percpu_t *pc, size_t sz) |
203 | { |
204 | |
205 | percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz); |
206 | } |
207 | |
208 | /* |
209 | * percpu_init: subsystem initialization |
210 | */ |
211 | |
212 | void |
213 | percpu_init(void) |
214 | { |
215 | |
216 | ASSERT_SLEEPABLE(); |
217 | rw_init(&percpu_swap_lock); |
218 | mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE); |
219 | percpu_nextoff = PERCPU_QUANTUM_SIZE; |
220 | |
	percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
222 | percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP, |
223 | IPL_NONE); |
224 | } |
225 | |
226 | /* |
227 | * percpu_init_cpu: cpu initialization |
228 | * |
229 | * => should be called before the cpu appears on the list for CPU_INFO_FOREACH. |
230 | */ |
231 | |
232 | void |
233 | percpu_init_cpu(struct cpu_info *ci) |
234 | { |
235 | percpu_cpu_t * const pcc = cpu_percpu(ci); |
236 | size_t size = percpu_nextoff; /* XXX racy */ |
237 | |
238 | ASSERT_SLEEPABLE(); |
239 | pcc->pcc_size = size; |
240 | if (size) { |
241 | pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP); |
242 | } |
243 | } |
244 | |
245 | /* |
246 | * percpu_alloc: allocate percpu storage |
247 | * |
248 | * => called in thread context. |
 * => considered an expensive and rare operation.
250 | * => allocated storage is initialized with zeros. |
251 | */ |
252 | |
253 | percpu_t * |
254 | percpu_alloc(size_t size) |
255 | { |
256 | vmem_addr_t offset; |
257 | percpu_t *pc; |
258 | |
259 | ASSERT_SLEEPABLE(); |
260 | if (vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT, |
261 | &offset) != 0) |
262 | return NULL; |
263 | pc = (percpu_t *)percpu_encrypt((uintptr_t)offset); |
264 | percpu_zero(pc, size); |
265 | return pc; |
266 | } |
267 | |
268 | /* |
269 | * percpu_free: free percpu storage |
270 | * |
271 | * => called in thread context. |
 * => considered an expensive and rare operation.
273 | */ |
274 | |
275 | void |
276 | percpu_free(percpu_t *pc, size_t size) |
277 | { |
278 | |
279 | ASSERT_SLEEPABLE(); |
280 | vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size); |
281 | } |
282 | |
283 | /* |
284 | * percpu_getref: |
285 | * |
286 | * => safe to be used in either thread or interrupt context |
 * => disables preemption; must be paired with a matching percpu_putref()
288 | */ |
289 | |
290 | void * |
291 | percpu_getref(percpu_t *pc) |
292 | { |
293 | |
294 | kpreempt_disable(); |
295 | return percpu_getptr_remote(pc, curcpu()); |
296 | } |
297 | |
298 | /* |
299 | * percpu_putref: |
300 | * |
 * => drops the preemption-disabled count after the caller is done with
 *    the per-cpu data
303 | */ |
304 | |
305 | void |
306 | percpu_putref(percpu_t *pc) |
307 | { |
308 | |
309 | kpreempt_enable(); |
310 | } |
311 | |
312 | /* |
313 | * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote: |
 * helpers to access a remote cpu's percpu data.
315 | * |
316 | * => called in thread context. |
317 | * => percpu_traverse_enter can block low-priority xcalls. |
318 | * => typical usage would be: |
319 | * |
320 | * sum = 0; |
321 | * percpu_traverse_enter(); |
322 | * for (CPU_INFO_FOREACH(cii, ci)) { |
323 | * unsigned int *p = percpu_getptr_remote(pc, ci); |
324 | * sum += *p; |
325 | * } |
326 | * percpu_traverse_exit(); |
327 | */ |
328 | |
329 | void |
330 | percpu_traverse_enter(void) |
331 | { |
332 | |
333 | ASSERT_SLEEPABLE(); |
334 | rw_enter(&percpu_swap_lock, RW_READER); |
335 | } |
336 | |
337 | void |
338 | percpu_traverse_exit(void) |
339 | { |
340 | |
341 | rw_exit(&percpu_swap_lock); |
342 | } |
343 | |
344 | void * |
345 | percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci) |
346 | { |
347 | |
348 | return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)]; |
349 | } |
350 | |
351 | /* |
352 | * percpu_foreach: call the specified callback function for each cpus. |
353 | * |
354 | * => called in thread context. |
355 | * => caller should not rely on the cpu iteration order. |
356 | * => the callback function should be minimum because it is executed with |
357 | * holding a global lock, which can block low-priority xcalls. |
358 | * eg. it's illegal for a callback function to sleep for memory allocation. |
359 | */ |
360 | void |
361 | percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg) |
362 | { |
363 | CPU_INFO_ITERATOR cii; |
364 | struct cpu_info *ci; |
365 | |
366 | percpu_traverse_enter(); |
367 | for (CPU_INFO_FOREACH(cii, ci)) { |
368 | (*cb)(percpu_getptr_remote(pc, ci), arg, ci); |
369 | } |
370 | percpu_traverse_exit(); |
371 | } |
372 | |