// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

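/*
 * Scratch buffers for building perf sample payloads, one per perf
 * recursion context (task, softirq, hardirq, NMI), shared by all
 * trace events that are in use.
 */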
static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int	total_ref_count;

static int perf_trace_event_perm(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	if (tp_event->perf_perm) {
		ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/*
	 * The parent event was already checked and allowed when it was
	 * created, so allow its children without checking again.
	 */
	if (p_event->parent)
		return 0;

	/*
	 * It's ok to check current process (owner) permissions in here,
	 * because the code below is called only via the perf_event_open()
	 * syscall.
	 */

	/* The ftrace function trace is allowed only for root. */
	if (ftrace_event_is_function(tp_event)) {
		ret = perf_allow_tracepoint(&p_event->attr);
		if (ret)
			return ret;

		if (!is_sampling_event(p_event))
			return 0;

		/*
		 * We don't allow user-space callchains for the function
		 * trace event, due to issues with page faults while tracing
		 * the page fault handler, and the overall trickiness of the
		 * matter.
		 */
		if (!p_event->attr.exclude_callchain_user)
			return -EINVAL;

		/*
		 * Same reason to disable user stack dump as for user-space
		 * callchains above.
		 */
		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
			return -EINVAL;
	}

	/* No tracing, just counting, so no obvious leak */
	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
		return 0;

	/* Some events are ok to be traced by non-root users... */
	if (p_event->attach_state == PERF_ATTACH_TASK) {
		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
			return 0;
	}

	/*
	 * ...otherwise raw tracepoint data can be a severe data leak;
	 * only allow root to have these.
	 */
	ret = perf_allow_tracepoint(&p_event->attr);
	if (ret)
		return ret;

	return 0;
}

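/*
 * Register @p_event with its trace event. The first perf event attached
 * to a given trace event allocates the per-CPU hlist that the event's
 * callback iterates, and the first one system-wide also allocates the
 * per-context scratch buffers.
 */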
static int perf_trace_event_reg(struct trace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	p_event->tp_event = tp_event;
	if (tp_event->perf_refcount++ > 0)
		return 0;

	list = alloc_percpu(struct hlist_head);
	if (!list)
		goto fail;

	for_each_possible_cpu(cpu)
		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

	tp_event->perf_events = list;

	if (!total_ref_count) {
		char __percpu *buf;
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			buf = (char __percpu *)alloc_percpu(perf_trace_t);
			if (!buf)
				goto fail;

			perf_trace_buf[i] = buf;
		}
	}

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	total_ref_count++;
	return 0;

fail:
	if (!total_ref_count) {
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}

	if (!--tp_event->perf_refcount) {
		free_percpu(tp_event->perf_events);
		tp_event->perf_events = NULL;
	}

	return ret;
}

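/*
 * Drop @p_event's reference on its trace event. The last perf event
 * attached to the trace event unregisters the callback and frees the
 * per-CPU hlist; the last one system-wide also frees the per-context
 * scratch buffers.
 */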
static void perf_trace_event_unreg(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	int i;

	if (--tp_event->perf_refcount > 0)
		return;

	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

	/*
	 * Ensure our callback won't be called anymore. The buffers
	 * will be freed after that.
	 */
	tracepoint_synchronize_unregister();

	free_percpu(tp_event->perf_events);
	tp_event->perf_events = NULL;

	if (!--total_ref_count) {
		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}
}

static int perf_trace_event_open(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

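/*
 * Bind @p_event to @tp_event: permission check, then registration, then
 * the event class's open callback. Registration is rolled back if the
 * open fails.
 */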
static int perf_trace_event_init(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	ret = perf_trace_event_perm(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_reg(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_open(p_event);
	if (ret) {
		perf_trace_event_unreg(p_event);
		return ret;
	}

	return 0;
}

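/*
 * Entry point for perf events of the tracepoint PMU type: look up the
 * trace event whose id matches perf_event_attr::config and initialize
 * @p_event against it, taking a reference that pins the event for as
 * long as it is in use.
 */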
int perf_trace_init(struct perf_event *p_event)
{
	struct trace_event_call *tp_event;
	u64 event_id = p_event->attr.config;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(tp_event, &ftrace_events, list) {
		if (tp_event->event.type == event_id &&
		    tp_event->class && tp_event->class->reg &&
		    trace_event_try_get_ref(tp_event)) {
			ret = perf_trace_event_init(tp_event, p_event);
			if (ret)
				trace_event_put_ref(tp_event);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}

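/* Tear down what perf_trace_init() set up. */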
void perf_trace_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
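/*
 * Create a perf-local kprobe (or kretprobe) trace event from the
 * symbol/address in the perf attr and bind @p_event to it. Note that
 * strndup_user() returns -EINVAL for an unterminated string, which is
 * mapped to -E2BIG here since it means the name was too long.
 */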
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
	int ret;
	char *func = NULL;
	struct trace_event_call *tp_event;

	if (p_event->attr.kprobe_func) {
		func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func),
				    KSYM_NAME_LEN);
		if (IS_ERR(func)) {
			ret = PTR_ERR(func);
			return (ret == -EINVAL) ? -E2BIG : ret;
		}

		if (func[0] == '\0') {
			kfree(func);
			func = NULL;
		}
	}

	tp_event = create_local_trace_kprobe(
		func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
		p_event->attr.probe_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_kprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(func);
	return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);

	destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
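/*
 * Create a perf-local uprobe (or uretprobe) trace event from the path
 * and offset in the perf attr and bind @p_event to it. As above, the
 * -EINVAL from an overlong path is mapped to -E2BIG.
 */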
int perf_uprobe_init(struct perf_event *p_event,
		     unsigned long ref_ctr_offset, bool is_retprobe)
{
	int ret;
	char *path = NULL;
	struct trace_event_call *tp_event;

	if (!p_event->attr.uprobe_path)
		return -EINVAL;

	path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
			    PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		return (ret == -EINVAL) ? -E2BIG : ret;
	}
	if (path[0] == '\0') {
		ret = -EINVAL;
		goto out;
	}

	tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
					     ref_ctr_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	/*
	 * A local trace_uprobe needs to hold event_mutex to call
	 * uprobe_buffer_enable() and uprobe_buffer_disable().
	 * event_mutex is not required for local trace_kprobes.
	 */
	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_uprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(path);
	return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);
	destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

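/*
 * PMU ->add() callback: make @p_event visible to its trace event's
 * callback on this CPU, unless the event class implements its own
 * TRACE_REG_PERF_ADD handling.
 */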
int perf_trace_add(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	if (!(flags & PERF_EF_START))
		p_event->hw.state = PERF_HES_STOPPED;

	/*
	 * If TRACE_REG_PERF_ADD returns false, no custom action was
	 * performed and we need to take the default action of enqueueing
	 * our event on the right per-CPU hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
		struct hlist_head __percpu *pcpu_list;
		struct hlist_head *list;

		pcpu_list = tp_event->perf_events;
		if (WARN_ON_ONCE(!pcpu_list))
			return -EINVAL;

		list = this_cpu_ptr(pcpu_list);
		hlist_add_head_rcu(&p_event->hlist_entry, list);
	}

	return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	/*
	 * If TRACE_REG_PERF_DEL returns false, no custom action was
	 * performed and we need to take the default action of dequeueing
	 * our event from the right per-CPU hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
		hlist_del_rcu(&p_event->hlist_entry);
}

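/*
 * Grab a perf recursion context and return this CPU's scratch buffer
 * for it, or NULL if @size is too big or we are already recursing. On
 * success the caller must end the section with perf_trace_buf_submit(),
 * which also releases the recursion context. A typical caller (see
 * perf_ftrace_function_call() below) looks roughly like:
 *
 *	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 *	if (!entry)
 *		return;
 *	... fill in *entry ...
 *	perf_trace_buf_submit(entry, size, rctx, type, 1, &regs, &head, NULL);
 */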
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
	char *raw_data;
	int rctx;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough, wanted %d, have %d",
		      size, PERF_MAX_TRACE_SIZE))
		return NULL;

	*rctxp = rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return NULL;

	if (regs)
		*regs = this_cpu_ptr(&__perf_regs[rctx]);
	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

	/* Zero the dead bytes from alignment to avoid leaking stack to user */
	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
	return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

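/* Stamp a record with its event type and the current tracing context. */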
void perf_trace_buf_update(void *record, u16 type)
{
	struct trace_entry *entry = record;

	tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
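/*
 * ftrace callback for the perf function-trace event: build an
 * ftrace_entry in the perf scratch buffer and submit it, but only on
 * the CPU this event was scheduled in on (ops->private).
 */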
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
	struct ftrace_entry *entry;
	struct perf_event *event;
	struct hlist_head head;
	struct pt_regs regs;
	int rctx;
	int bit;

	if (!rcu_is_watching())
		return;

	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0)
		return;

	if ((unsigned long)ops->private != smp_processor_id())
		goto out;

	event = container_of(ops, struct perf_event, ftrace_ops);

	/*
	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
	 * the perf code does is hlist_for_each_entry_rcu(), so we can
	 * get away with simply setting the @head.first pointer in order
	 * to create a singular list.
	 */
	head.first = &event->hlist_entry;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

	memset(&regs, 0, sizeof(regs));
	perf_fetch_caller_regs(&regs);

	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
	if (!entry)
		goto out;

	entry->ip = ip;
	entry->parent_ip = parent_ip;
	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
			      1, &regs, &head, NULL);

out:
	ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}

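/*
 * Set up the event's ftrace_ops. ops->private starts out as nr_cpu_ids,
 * i.e. no valid CPU, so the callback stays inert until TRACE_REG_PERF_ADD
 * sets it to the current CPU.
 */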
static int perf_ftrace_function_register(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;

	ops->func    = perf_ftrace_function_call;
	ops->private = (void *)(unsigned long)nr_cpu_ids;

	return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = unregister_ftrace_function(ops);
	ftrace_free_filter(ops);
	return ret;
}

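/*
 * TRACE_REG dispatch for the function trace event. ADD/DEL return 1 to
 * tell the core that no hlist enqueueing is needed; enabling is done by
 * switching ops->private between the current CPU and nr_cpu_ids.
 */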
int perf_ftrace_event_register(struct trace_event_call *call,
			       enum trace_reg type, void *data)
{
	struct perf_event *event = data;

	switch (type) {
	case TRACE_REG_REGISTER:
	case TRACE_REG_UNREGISTER:
		break;
	case TRACE_REG_PERF_REGISTER:
	case TRACE_REG_PERF_UNREGISTER:
		return 0;
	case TRACE_REG_PERF_OPEN:
		return perf_ftrace_function_register(data);
	case TRACE_REG_PERF_CLOSE:
		return perf_ftrace_function_unregister(data);
	case TRACE_REG_PERF_ADD:
		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
		return 1;
	case TRACE_REG_PERF_DEL:
		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
		return 1;
	}

	return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */