// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"
#include "mmap_unlock_work.h"

#define STACK_CREATE_FLAG_MASK					\
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
	 BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 data[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
	return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
	return stack_map_use_build_id(map) ?
		sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
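
/*
 * Illustration only (not part of this file): as seen from userspace, each
 * map value is therefore either an array of raw instruction pointers or an
 * array of build-id records, depending on BPF_F_STACK_BUILD_ID:
 *
 *	u64 ips[value_size / sizeof(u64)];		// flag clear
 *	struct bpf_stack_build_id
 *		frames[value_size / sizeof(struct bpf_stack_build_id)];	// flag set
 */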

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u64 elem_size = sizeof(struct stack_map_bucket) +
			(u64)smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
					 smap->map.numa_node);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8)
		return ERR_PTR(-EINVAL);

	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
		if (value_size % sizeof(struct bpf_stack_build_id) ||
		    value_size / sizeof(struct bpf_stack_build_id)
		    > sysctl_perf_event_max_stack)
			return ERR_PTR(-EINVAL);
	} else if (value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2; roundup_pow_of_two() can overflow
	 * into UB on 32-bit arches, so check that first
	 */
	if (attr->max_entries > 1UL << 31)
		return ERR_PTR(-E2BIG);

	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!smap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&smap->map, attr);
	smap->n_buckets = n_buckets;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
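
/*
 * A minimal userspace sketch (assuming libbpf's bpf_map_create() wrapper;
 * not part of this file) of creating a map that passes the checks in
 * stack_map_alloc(): key_size must be 4, value_size a non-zero multiple
 * of 8, and value_size / 8 must not exceed sysctl_perf_event_max_stack:
 *
 *	#include <bpf/bpf.h>
 *
 *	int create_stack_map(void)
 *	{
 *		// 127 frames of 8 bytes each, within the default
 *		// perf_event_max_stack limit
 *		return bpf_map_create(BPF_MAP_TYPE_STACK_TRACE, "stacks",
 *				      sizeof(__u32), 127 * sizeof(__u64),
 *				      16384, NULL);
 *	}
 */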

static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
{
	return may_fault ? build_id_parse(vma, build_id, NULL)
			 : build_id_parse_nofault(vma, build_id, NULL);
}

/*
 * Expects all id_offs[i].ip values to be set to correct initial IPs.
 * They will be subsequently:
 *   - either adjusted in place to a file offset, if build ID fetching
 *     succeeds; in this case id_offs[i].build_id is set to correct build ID,
 *     and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
 *   - or IP will be kept intact, if build ID fetching failed; in this case
 *     id_offs[i].build_id is zeroed out and id_offs[i].status is set to
 *     BPF_STACK_BUILD_ID_IP.
 */
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
					  u32 trace_nr, bool user, bool may_fault)
{
	int i;
	struct mmap_unlock_irq_work *work = NULL;
	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
	struct vm_area_struct *vma, *prev_vma = NULL;
	const char *prev_build_id;

	/* If the irq_work is in use, fall back to report ips. Same
	 * fallback is used for kernel stack (!user) on a stackmap with
	 * build_id.
	 */
	if (!user || !current || !current->mm || irq_work_busy ||
	    !mmap_read_trylock(current->mm)) {
		/* cannot access current->mm, fall back to ips */
		for (i = 0; i < trace_nr; i++) {
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
		}
		return;
	}

	for (i = 0; i < trace_nr; i++) {
		u64 ip = READ_ONCE(id_offs[i].ip);

		if (range_in_vma(prev_vma, ip, ip)) {
			vma = prev_vma;
			memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
			goto build_id_valid;
		}
		vma = find_vma(current->mm, ip);
		if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
			/* per entry fall back to ips */
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
			continue;
		}
build_id_valid:
		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
		prev_vma = vma;
		prev_build_id = id_offs[i].build_id;
	}
	bpf_mmap_unlock_mm(work, current->mm);
}

static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
	struct perf_callchain_entry *entry;
	int rctx;

	entry = get_callchain_entry(&rctx);

	if (!entry)
		return NULL;

	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
					 max_depth, 0);

	/* stack_trace_save_tsk() works on unsigned long array, while
	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
	 * necessary to fix this mismatch.
	 */
	if (__BITS_PER_LONG != 64) {
		unsigned long *from = (unsigned long *) entry->ip;
		u64 *to = entry->ip;
		int i;

		/* copy data from the end to avoid using extra buffer */
		for (i = entry->nr - 1; i >= 0; i--)
			to[i] = (u64)(from[i]);
	}

	put_callchain_entry(rctx);

	return entry;
#else /* CONFIG_STACKTRACE */
	return NULL;
#endif
}

static long __bpf_get_stackid(struct bpf_map *map,
			      struct perf_callchain_entry *trace, u64 flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len, i;
	bool user = flags & BPF_F_USER_STACK;
	u64 *ips;
	bool hash_matches;

	if (trace->nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr = trace->nr - skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	hash_matches = bucket && bucket->hash == hash;
	/* fast cmp */
	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
		return id;

	if (stack_map_use_build_id(map)) {
		struct bpf_stack_build_id *id_offs;

		/* for build_id+offset, pop a bucket before slow cmp */
		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		new_bucket->nr = trace_nr;
		id_offs = (struct bpf_stack_build_id *)new_bucket->data;
		for (i = 0; i < trace_nr; i++)
			id_offs[i].ip = ips[i];
		stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return id;
		}
		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return -EEXIST;
		}
	} else {
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, ips, trace_len) == 0)
			return id;
		if (bucket && !(flags & BPF_F_REUSE_STACKID))
			return -EEXIST;

		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		memcpy(new_bucket->data, ips, trace_len);
	}

	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	u32 max_depth = map->value_size / stack_map_data_size(map);
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	max_depth += skip;
	if (max_depth > sysctl_perf_event_max_stack)
		max_depth = sysctl_perf_event_max_stack;

	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
				   false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
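
/*
 * A minimal BPF-side sketch (hypothetical program, assuming the libbpf
 * BTF-defined map syntax and the bpf_get_stackid() helper declaration
 * from bpf_helpers.h) of how a tracing program typically feeds this map:
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_STACK_TRACE);
 *		__uint(key_size, sizeof(u32));
 *		__uint(value_size, 127 * sizeof(u64));
 *		__uint(max_entries, 16384);
 *	} stacks SEC(".maps");
 *
 *	SEC("kprobe/try_to_wake_up")
 *	int on_wakeup(struct pt_regs *ctx)
 *	{
 *		long id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP);
 *
 *		// id >= 0 is the bucket index (usable later as a map key);
 *		// negative values are the errors from __bpf_get_stackid()
 *		return 0;
 *	}
 */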

static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
	__u64 nr_kernel = 0;

	while (nr_kernel < trace->nr) {
		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
			break;
		nr_kernel++;
	}
	return nr_kernel;
}

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_map *, map, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	__u64 nr_kernel;
	int ret;

	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
	if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
		return bpf_get_stackid((unsigned long)(ctx->regs),
				       (unsigned long) map, flags, 0, 0);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	trace = ctx->data->callchain;
	if (unlikely(!trace))
		return -EFAULT;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		ret = __bpf_get_stackid(map, trace, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			return -EFAULT;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		ret = __bpf_get_stackid(map, trace, flags);
	}
	return ret;
}

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
			    struct perf_callchain_entry *trace_in,
			    void *buf, u32 size, u64 flags, bool may_fault)
{
	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	bool crosstask = task && task != current;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	if (kernel && user_build_id)
		goto clear;

	elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	/* cannot get valid user stack for task without user_mode regs */
	if (task && user && !user_mode(regs))
		goto err_fault;

	/* get_perf_callchain does not support crosstask user stack walking
	 * but returns an empty stack instead of NULL.
	 */
	if (crosstask && user) {
		err = -EOPNOTSUPP;
		goto clear;
	}

	num_elem = size / elem_size;
	max_depth = num_elem + skip;
	if (sysctl_perf_event_max_stack < max_depth)
		max_depth = sysctl_perf_event_max_stack;

	if (may_fault)
		rcu_read_lock(); /* need RCU for perf's callchain below */

	if (trace_in)
		trace = trace_in;
	else if (kernel && task)
		trace = get_callchain_entry_for_task(task, max_depth);
	else
		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
					   crosstask, false);

	if (unlikely(!trace) || trace->nr < skip) {
		if (may_fault)
			rcu_read_unlock();
		goto err_fault;
	}

	trace_nr = trace->nr - skip;
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
	copy_len = trace_nr * elem_size;

	ips = trace->ip + skip;
	if (user_build_id) {
		struct bpf_stack_build_id *id_offs = buf;
		u32 i;

		for (i = 0; i < trace_nr; i++)
			id_offs[i].ip = ips[i];
	} else {
		memcpy(buf, ips, copy_len);
	}

	/* trace/ips should not be dereferenced after this point */
	if (may_fault)
		rcu_read_unlock();

	if (user_build_id)
		stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);

	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);
	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}

const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
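
/*
 * A minimal BPF-side sketch (hypothetical program and attach point,
 * assuming bpf_helpers.h) of bpf_get_stack() copying raw kernel
 * instruction pointers straight into a buffer, bypassing the stack map:
 *
 *	SEC("kprobe/do_sys_openat2")
 *	int on_open(struct pt_regs *ctx)
 *	{
 *		u64 ips[32] = {};
 *		long len;
 *
 *		len = bpf_get_stack(ctx, ips, sizeof(ips), 0);
 *		// len is the number of bytes filled in, or a negative
 *		// error from __bpf_get_stack()
 *		return 0;
 *	}
 */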

BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
}

const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
	.func		= bpf_get_stack_sleepable,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
				 u64 flags, bool may_fault)
{
	struct pt_regs *regs;
	long res = -EINVAL;

	if (!try_get_task_stack(task))
		return -EFAULT;

	regs = task_pt_regs(task);
	if (regs)
		res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
	put_task_stack(task);

	return res;
}

BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
}

const struct bpf_func_proto bpf_get_task_stack_proto = {
	.func		= bpf_get_task_stack,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
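
/*
 * A minimal BPF-side sketch (hypothetical task iterator, assuming
 * bpf_helpers.h and the bpf_get_task_stack() helper declaration) of
 * dumping the stack of a task other than current:
 *
 *	SEC("iter/task")
 *	int dump_task_stack(struct bpf_iter__task *ctx)
 *	{
 *		struct task_struct *task = ctx->task;
 *		u64 ips[20] = {};
 *
 *		if (!task)
 *			return 0;
 *		bpf_get_task_stack(task, ips, sizeof(ips), 0);
 *		return 0;
 *	}
 */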

BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	return __bpf_get_task_stack(task, buf, size, flags, true /* may_fault */);
}

const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
	.func		= bpf_get_task_stack_sleepable,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	int err = -EINVAL;
	__u64 nr_kernel;

	if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	err = -EFAULT;
	trace = ctx->data->callchain;
	if (unlikely(!trace))
		goto clear;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			goto clear;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
	}
	return err;

clear:
	memset(buf, 0, size);
	return err;
}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
	.func		= bpf_get_stack_pe,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * stack_map_data_size(map);
	memcpy(value, bucket->data, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}
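
/*
 * A minimal userspace sketch (assuming libbpf's bpf_map_lookup_elem()
 * wrapper; the function name and sizes are illustrative) of the syscall
 * path that lands in bpf_stackmap_copy(): the key is a stack id returned
 * by bpf_get_stackid(), and the value receives the stored trace padded
 * with zeroes up to value_size:
 *
 *	int dump_stack_by_id(int map_fd, __u32 stack_id)
 *	{
 *		__u64 ips[127];
 *
 *		if (bpf_map_lookup_elem(map_fd, &stack_id, ips))
 *			return -1;	// e.g. -ENOENT for an unused id
 *		// ips[] now holds bucket->nr frames, the rest is zeroed
 *		return 0;
 *	}
 */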

static int stack_map_get_next_key(struct bpf_map *map, void *key,
				  void *next_key)
{
	struct bpf_stack_map *smap = container_of(map,
						  struct bpf_stack_map, map);
	u32 id;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!key) {
		id = 0;
	} else {
		id = *(u32 *)key;
		if (id >= smap->n_buckets || !smap->buckets[id])
			id = 0;
		else
			id++;
	}

	while (id < smap->n_buckets && !smap->buckets[id])
		id++;

	if (id >= smap->n_buckets)
		return -ENOENT;

	*(u32 *)next_key = id;
	return 0;
}

static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				  u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}

static u64 stack_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	u64 value_size = map->value_size;
	u64 n_buckets = smap->n_buckets;
	u64 entries = map->max_entries;
	u64 usage = sizeof(*smap);

	usage += n_buckets * sizeof(struct stack_map_bucket *);
	usage += entries * (sizeof(struct stack_map_bucket) + value_size);
	return usage;
}
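
/*
 * Worked example (illustrative numbers): with max_entries = 10000 and
 * value_size = 1016 (127 * 8), n_buckets rounds up to 16384, so the
 * reported usage is sizeof(*smap) + 16384 * 8 bytes of bucket pointers
 * plus 10000 * (sizeof(struct stack_map_bucket) + 1016) bytes of
 * preallocated elements, i.e. a little over 10 MB.
 */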

BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = stack_map_mem_usage,
	.map_btf_id = &stack_trace_map_btf_ids[0],
};