1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright IBM Corp. 2018
4 * Auxtrace support for s390 CPU-Measurement Sampling Facility
5 *
6 * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
7 *
8 * Auxiliary traces are collected during 'perf record' using rbd000 event.
9 * Several PERF_RECORD_XXX are generated during recording:
10 *
11 * PERF_RECORD_AUX:
12 *	Records that new data landed in the AUX buffer part.
13 * PERF_RECORD_AUXTRACE:
14 *	Defines auxtrace data. Followed by the actual data. The contents of
15 *	the auxtrace data is dependent on the event and the CPU.
16 *	This record is generated by perf record command. For details
17 *	see Documentation/perf.data-file-format.txt.
18 * PERF_RECORD_AUXTRACE_INFO:
19 *	Defines a table of contains for PERF_RECORD_AUXTRACE records. This
20 *	record is generated during 'perf record' command. Each record contains
21 *	up to 256 entries describing offset and size of the AUXTRACE data in the
22 *	perf.data file.
23 * PERF_RECORD_AUXTRACE_ERROR:
24 *	Indicates an error during AUXTRACE collection such as buffer overflow.
25 * PERF_RECORD_FINISHED_ROUND:
26 *	Perf events are not necessarily in time stamp order, as they can be
27 *	collected in parallel on different CPUs. If the events should be
28 *	processed in time order they need to be sorted first.
29 *	Perf report guarantees that there is no reordering over a
30 *	PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
31 *	time stamp lower than this record are processed (and displayed) before
32 *	the succeeding perf record are processed.
33 *
34 * These records are evaluated during perf report command.
35 *
36 * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
37 * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
38 * below.
39 * Auxiliary trace data is collected per CPU. To merge the data into the report
40 * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
41 * data is in ascending order.
42 *
43 * Each queue has a double linked list of auxtrace_buffers. This list contains
44 * the offset and size of a CPU's auxtrace data. During auxtrace processing
45 * the data portion is mmap()'ed.
46 *
47 * To sort the queues in chronological order, all queue access is controlled
48 * by the auxtrace_heap. This is basically a stack, each stack element has two
49 * entries, the queue number and a time stamp. However the stack is sorted by
50 * the time stamps. The highest time stamp is at the bottom the lowest
51 * (nearest) time stamp is at the top. That sort order is maintained at all
52 * times!
53 *
54 * After the auxtrace infrastructure has been setup, the auxtrace queues are
55 * filled with data (offset/size pairs) and the auxtrace_heap is populated.
56 *
57 * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
58 * Each record is handled by s390_cpumsf_process_event(). The time stamp of
59 * the perf record is compared with the time stamp located on the auxtrace_heap
60 * top element. If that time stamp is lower than the time stamp from the
61 * record sample, the auxtrace queues will be processed. As auxtrace queues
62 * control many auxtrace_buffers and each buffer can be quite large, the
63 * auxtrace buffer might be processed only partially. In this case the
64 * position in the auxtrace_buffer of that queue is remembered and the time
65 * stamp of the last processed entry of the auxtrace_buffer replaces the
66 * current auxtrace_heap top.
67 *
68 * 3. Auxtrace_queues might run of out data and are fed by the
69 * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
70 *
71 * Event Generation
72 * Each sampling-data entry in the auxiliary trace data generates a perf sample.
73 * This sample is filled
74 * with data from the auxtrace such as PID/TID, instruction address, CPU state,
75 * etc. This sample is processed with perf_session__deliver_synth_event() to
76 * be included into the GUI.
77 *
78 * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining
79 * auxiliary traces entries until the time stamp of this record is reached
80 * auxtrace_heap top. This is triggered by ordered_event->deliver().
81 *
82 *
83 * Perf event processing.
84 * Event processing of PERF_RECORD_XXX entries relies on time stamp entries.
85 * This is the function call sequence:
86 *
87 * __cmd_report()
88 * |
89 * perf_session__process_events()
90 * |
91 * __perf_session__process_events()
92 * |
93 * perf_session__process_event()
94 * |  This functions splits the PERF_RECORD_XXX records.
95 * |  - Those generated by perf record command (type number equal or higher
96 * |    than PERF_RECORD_USER_TYPE_START) are handled by
97 * |    perf_session__process_user_event(see below)
98 * |  - Those generated by the kernel are handled by
99 * |    evlist__parse_sample_timestamp()
100 * |
101 * evlist__parse_sample_timestamp()
102 * |  Extract time stamp from sample data.
103 * |
104 * perf_session__queue_event()
105 * |  If timestamp is positive the sample is entered into an ordered_event
106 * |  list, sort order is the timestamp. The event processing is deferred until
107 * |  later (see perf_session__process_user_event()).
108 * |  Other timestamps (0 or -1) are handled immediately by
109 * |  perf_session__deliver_event(). These are events generated at start up
110 * |  of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP*
111 * |  records. They are needed to create a list of running processes and its
112 * |  memory mappings and layout. They are needed at the beginning to enable
113 * |  command perf report to create process trees and memory mappings.
114 * |
115 * perf_session__deliver_event()
116 * |  Delivers a PERF_RECORD_XXX entry for handling.
117 * |
118 * auxtrace__process_event()
119 * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
120 * |  time stamps from the auxiliary trace buffers. This enables
121 * |  synchronization between auxiliary trace data and the events on the
122 * |  perf.data file.
123 * |
124 * machine__deliver_event()
125 * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
126 *    It might update the process tree, update a process memory map or enter
127 *    a sample with IP and call back chain data into GUI data pool.
128 *
129 *
130 * Deferred processing determined by perf_session__process_user_event() is
131 * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These
132 * are generated during command perf record.
133 * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all
134 * PERF_RECORD_XXX entries stored in the ordered_event list. This list was
135 * built up while reading the perf.data file.
136 * Each event is now processed by calling perf_session__deliver_event().
137 * This enables time synchronization between the data in the perf.data file and
138 * the data in the auxiliary trace buffers.
139 */
140
141#include <endian.h>
142#include <errno.h>
143#include <byteswap.h>
144#include <inttypes.h>
145#include <linux/kernel.h>
146#include <linux/types.h>
147#include <linux/bitops.h>
148#include <linux/log2.h>
149#include <linux/zalloc.h>
150
151#include <sys/stat.h>
152#include <sys/types.h>
153
154#include "color.h"
155#include "evsel.h"
156#include "evlist.h"
157#include "machine.h"
158#include "session.h"
159#include "tool.h"
160#include "debug.h"
161#include "auxtrace.h"
162#include "s390-cpumsf.h"
163#include "s390-cpumsf-kernel.h"
164#include "s390-cpumcf-kernel.h"
165#include "config.h"
166#include "util/sample.h"
167
168struct s390_cpumsf {
169	struct auxtrace		auxtrace;
170	struct auxtrace_queues	queues;
171	struct auxtrace_heap	heap;
172	struct perf_session	*session;
173	struct machine		*machine;
174	u32			auxtrace_type;
175	u32			pmu_type;
176	u16			machine_type;
177	bool			data_queued;
178	bool			use_logfile;
179	char			*logdir;
180};
181
182struct s390_cpumsf_queue {
183	struct s390_cpumsf	*sf;
184	unsigned int		queue_nr;
185	struct auxtrace_buffer	*buffer;
186	int			cpu;
187	FILE			*logfile;
188	FILE			*logfile_ctr;
189};
190
191/* Check if the raw data should be dumped to file. If this is the case and
192 * the file to dump to has not been opened for writing, do so.
193 *
194 * Return 0 on success and greater zero on error so processing continues.
195 */
196static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf,
197			       struct perf_sample *sample)
198{
199	struct s390_cpumsf_queue *sfq;
200	struct auxtrace_queue *q;
201	int rc = 0;
202
203	if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu)
204		return rc;
205
206	q = &sf->queues.queue_array[sample->cpu];
207	sfq = q->priv;
208	if (!sfq)		/* Queue not yet allocated */
209		return rc;
210
211	if (!sfq->logfile_ctr) {
212		char *name;
213
214		rc = (sf->logdir)
215			? asprintf(&name, "%s/aux.ctr.%02x",
216				 sf->logdir, sample->cpu)
217			: asprintf(&name, "aux.ctr.%02x", sample->cpu);
218		if (rc > 0)
219			sfq->logfile_ctr = fopen(name, "w");
220		if (sfq->logfile_ctr == NULL) {
221			pr_err("Failed to open counter set log file %s, "
222			       "continue...\n", name);
223			rc = 1;
224		}
225		free(name);
226	}
227
228	if (sfq->logfile_ctr) {
229		/* See comment above for -4 */
230		size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1,
231				  sfq->logfile_ctr);
232		if (n != 1) {
233			pr_err("Failed to write counter set data\n");
234			rc = 1;
235		}
236	}
237	return rc;
238}
239
240/* Display s390 CPU measurement facility basic-sampling data entry
241 * Data written on s390 in big endian byte order and contains bit
242 * fields across byte boundaries.
243 */
244static bool s390_cpumsf_basic_show(const char *color, size_t pos,
245				   struct hws_basic_entry *basicp)
246{
247	struct hws_basic_entry *basic = basicp;
248#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
249	struct hws_basic_entry local;
250	unsigned long long word = be64toh(*(unsigned long long *)basicp);
251
252	memset(&local, 0, sizeof(local));
253	local.def = be16toh(basicp->def);
254	local.prim_asn = word & 0xffff;
255	local.CL = word >> 30 & 0x3;
256	local.I = word >> 32 & 0x1;
257	local.AS = word >> 33 & 0x3;
258	local.P = word >> 35 & 0x1;
259	local.W = word >> 36 & 0x1;
260	local.T = word >> 37 & 0x1;
261	local.U = word >> 40 & 0xf;
262	local.ia = be64toh(basicp->ia);
263	local.gpp = be64toh(basicp->gpp);
264	local.hpp = be64toh(basicp->hpp);
265	basic = &local;
266#endif
267	if (basic->def != 1) {
268		pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
269		return false;
270	}
271	color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
272		      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
273		      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
274		      pos, basic->def, basic->U,
275		      basic->T ? 'T' : ' ',
276		      basic->W ? 'W' : ' ',
277		      basic->P ? 'P' : ' ',
278		      basic->I ? 'I' : ' ',
279		      basic->AS, basic->prim_asn, basic->ia, basic->CL,
280		      basic->hpp, basic->gpp);
281	return true;
282}
283
284/* Display s390 CPU measurement facility diagnostic-sampling data entry.
285 * Data written on s390 in big endian byte order and contains bit
286 * fields across byte boundaries.
287 */
288static bool s390_cpumsf_diag_show(const char *color, size_t pos,
289				  struct hws_diag_entry *diagp)
290{
291	struct hws_diag_entry *diag = diagp;
292#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
293	struct hws_diag_entry local;
294	unsigned long long word = be64toh(*(unsigned long long *)diagp);
295
296	local.def = be16toh(diagp->def);
297	local.I = word >> 32 & 0x1;
298	diag = &local;
299#endif
300	if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
301		pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
302		return false;
303	}
304	color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
305		      pos, diag->def, diag->I ? 'I' : ' ');
306	return true;
307}
308
309/* Return TOD timestamp contained in an trailer entry */
310static unsigned long long trailer_timestamp(struct hws_trailer_entry *te,
311					    int idx)
312{
313	/* te->t set: TOD in STCKE format, bytes 8-15
314	 * to->t not set: TOD in STCK format, bytes 0-7
315	 */
316	unsigned long long ts;
317
318	memcpy(&ts, &te->timestamp[idx], sizeof(ts));
319	return be64toh(ts);
320}
321
322/* Display s390 CPU measurement facility trailer entry */
323static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
324				     struct hws_trailer_entry *te)
325{
326#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
327	struct hws_trailer_entry local;
328	const unsigned long long flags = be64toh(te->flags);
329
330	memset(&local, 0, sizeof(local));
331	local.f = flags >> 63 & 0x1;
332	local.a = flags >> 62 & 0x1;
333	local.t = flags >> 61 & 0x1;
334	local.bsdes = be16toh((flags >> 16 & 0xffff));
335	local.dsdes = be16toh((flags & 0xffff));
336	memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp));
337	local.overflow = be64toh(te->overflow);
338	local.clock_base = be64toh(te->progusage[0]) >> 63 & 1;
339	local.progusage2 = be64toh(te->progusage2);
340	te = &local;
341#endif
342	if (te->bsdes != sizeof(struct hws_basic_entry)) {
343		pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
344		return false;
345	}
346	color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
347		      " dsdes:%d Overflow:%lld Time:%#llx\n"
348		      "\t\tC:%d TOD:%#lx\n",
349		      pos,
350		      te->f ? 'F' : ' ',
351		      te->a ? 'A' : ' ',
352		      te->t ? 'T' : ' ',
353		      te->bsdes, te->dsdes, te->overflow,
354		      trailer_timestamp(te, te->clock_base),
355		      te->clock_base, te->progusage2);
356	return true;
357}
358
359/* Test a sample data block. It must be 4KB or a multiple thereof in size and
360 * 4KB page aligned. Each sample data page has a trailer entry at the
361 * end which contains the sample entry data sizes.
362 *
363 * Return true if the sample data block passes the checks and set the
364 * basic set entry size and diagnostic set entry size.
365 *
366 * Return false on failure.
367 *
368 * Note: Old hardware does not set the basic or diagnostic entry sizes
369 * in the trailer entry. Use the type number instead.
370 */
371static bool s390_cpumsf_validate(int machine_type,
372				 unsigned char *buf, size_t len,
373				 unsigned short *bsdes,
374				 unsigned short *dsdes)
375{
376	struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
377	struct hws_trailer_entry *te;
378
379	*dsdes = *bsdes = 0;
380	if (len & (S390_CPUMSF_PAGESZ - 1))	/* Illegal size */
381		return false;
382	if (be16toh(basic->def) != 1)	/* No basic set entry, must be first */
383		return false;
384	/* Check for trailer entry at end of SDB */
385	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
386					      - sizeof(*te));
387	*bsdes = be16toh(te->bsdes);
388	*dsdes = be16toh(te->dsdes);
389	if (!te->bsdes && !te->dsdes) {
390		/* Very old hardware, use CPUID */
391		switch (machine_type) {
392		case 2097:
393		case 2098:
394			*dsdes = 64;
395			*bsdes = 32;
396			break;
397		case 2817:
398		case 2818:
399			*dsdes = 74;
400			*bsdes = 32;
401			break;
402		case 2827:
403		case 2828:
404			*dsdes = 85;
405			*bsdes = 32;
406			break;
407		case 2964:
408		case 2965:
409			*dsdes = 112;
410			*bsdes = 32;
411			break;
412		default:
413			/* Illegal trailer entry */
414			return false;
415		}
416	}
417	return true;
418}
419
420/* Return true if there is room for another entry */
421static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
422{
423	size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
424
425	if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
426		return false;
427	return true;
428}
429
430/* Dump an auxiliary buffer. These buffers are multiple of
431 * 4KB SDB pages.
432 */
433static void s390_cpumsf_dump(struct s390_cpumsf *sf,
434			     unsigned char *buf, size_t len)
435{
436	const char *color = PERF_COLOR_BLUE;
437	struct hws_basic_entry *basic;
438	struct hws_diag_entry *diag;
439	unsigned short bsdes, dsdes;
440	size_t pos = 0;
441
442	color_fprintf(stdout, color,
443		      ". ... s390 AUX data: size %zu bytes\n",
444		      len);
445
446	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
447				  &dsdes)) {
448		pr_err("Invalid AUX trace data block size:%zu"
449		       " (type:%d bsdes:%hd dsdes:%hd)\n",
450		       len, sf->machine_type, bsdes, dsdes);
451		return;
452	}
453
454	/* s390 kernel always returns 4KB blocks fully occupied,
455	 * no partially filled SDBs.
456	 */
457	while (pos < len) {
458		/* Handle Basic entry */
459		basic = (struct hws_basic_entry *)(buf + pos);
460		if (s390_cpumsf_basic_show(color, pos, basic))
461			pos += bsdes;
462		else
463			return;
464
465		/* Handle Diagnostic entry */
466		diag = (struct hws_diag_entry *)(buf + pos);
467		if (s390_cpumsf_diag_show(color, pos, diag))
468			pos += dsdes;
469		else
470			return;
471
472		/* Check for trailer entry */
473		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
474			/* Show trailer entry */
475			struct hws_trailer_entry te;
476
477			pos = (pos + S390_CPUMSF_PAGESZ)
478			       & ~(S390_CPUMSF_PAGESZ - 1);
479			pos -= sizeof(te);
480			memcpy(&te, buf + pos, sizeof(te));
481			/* Set descriptor sizes in case of old hardware
482			 * where these values are not set.
483			 */
484			te.bsdes = bsdes;
485			te.dsdes = dsdes;
486			if (s390_cpumsf_trailer_show(color, pos, &te))
487				pos += sizeof(te);
488			else
489				return;
490		}
491	}
492}
493
494static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
495				   size_t len)
496{
497	printf(".\n");
498	s390_cpumsf_dump(sf, buf, len);
499}
500
501#define	S390_LPP_PID_MASK	0xffffffff
502
503static bool s390_cpumsf_make_event(size_t pos,
504				   struct hws_basic_entry *basic,
505				   struct s390_cpumsf_queue *sfq)
506{
507	struct perf_sample sample = {
508				.ip = basic->ia,
509				.pid = basic->hpp & S390_LPP_PID_MASK,
510				.tid = basic->hpp & S390_LPP_PID_MASK,
511				.cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
512				.cpu = sfq->cpu,
513				.period = 1
514			    };
515	union perf_event event;
516
517	memset(&event, 0, sizeof(event));
518	if (basic->CL == 1)	/* Native LPAR mode */
519		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
520					  : PERF_RECORD_MISC_KERNEL;
521	else if (basic->CL == 2)	/* Guest kernel/user space */
522		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
523					  : PERF_RECORD_MISC_GUEST_KERNEL;
524	else if (basic->gpp || basic->prim_asn != 0xffff)
525		/* Use heuristics on old hardware */
526		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
527					  : PERF_RECORD_MISC_GUEST_KERNEL;
528	else
529		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
530					  : PERF_RECORD_MISC_KERNEL;
531
532	event.sample.header.type = PERF_RECORD_SAMPLE;
533	event.sample.header.misc = sample.cpumode;
534	event.sample.header.size = sizeof(struct perf_event_header);
535
536	pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
537		 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
538		 sample.tid, sample.cpumode, sample.cpu);
539	if (perf_session__deliver_synth_event(sfq->sf->session, &event,
540					      &sample)) {
541		pr_err("s390 Auxiliary Trace: failed to deliver event\n");
542		return false;
543	}
544	return true;
545}
546
547static unsigned long long get_trailer_time(const unsigned char *buf)
548{
549	struct hws_trailer_entry *te;
550	unsigned long long aux_time, progusage2;
551	bool clock_base;
552
553	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
554					      - sizeof(*te));
555
556#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
557	clock_base = be64toh(te->progusage[0]) >> 63 & 0x1;
558	progusage2 = be64toh(te->progusage[1]);
559#else
560	clock_base = te->clock_base;
561	progusage2 = te->progusage2;
562#endif
563	if (!clock_base)	/* TOD_CLOCK_BASE value missing */
564		return 0;
565
566	/* Correct calculation to convert time stamp in trailer entry to
567	 * nano seconds (taken from arch/s390 function tod_to_ns()).
568	 * TOD_CLOCK_BASE is stored in trailer entry member progusage2.
569	 */
570	aux_time = trailer_timestamp(te, clock_base) - progusage2;
571	aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
572	return aux_time;
573}
574
575/* Process the data samples of a single queue. The first parameter is a
576 * pointer to the queue, the second parameter is the time stamp. This
577 * is the time stamp:
578 * - of the event that triggered this processing.
579 * - or the time stamp when the last processing of this queue stopped.
580 *   In this case it stopped at a 4KB page boundary and record the
581 *   position on where to continue processing on the next invocation
582 *   (see buffer->use_data and buffer->use_size).
583 *
584 * When this function returns the second parameter is updated to
585 * reflect the time stamp of the last processed auxiliary data entry
586 * (taken from the trailer entry of that page). The caller uses this
587 * returned time stamp to record the last processed entry in this
588 * queue.
589 *
590 * The function returns:
591 * 0:  Processing successful. The second parameter returns the
592 *     time stamp from the trailer entry until which position
593 *     processing took place. Subsequent calls resume from this
594 *     position.
595 * <0: An error occurred during processing. The second parameter
596 *     returns the maximum time stamp.
597 * >0: Done on this queue. The second parameter returns the
598 *     maximum time stamp.
599 */
600static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
601{
602	struct s390_cpumsf *sf = sfq->sf;
603	unsigned char *buf = sfq->buffer->use_data;
604	size_t len = sfq->buffer->use_size;
605	struct hws_basic_entry *basic;
606	unsigned short bsdes, dsdes;
607	size_t pos = 0;
608	int err = 1;
609	u64 aux_ts;
610
611	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
612				  &dsdes)) {
613		*ts = ~0ULL;
614		return -1;
615	}
616
617	/* Get trailer entry time stamp and check if entries in
618	 * this auxiliary page are ready for processing. If the
619	 * time stamp of the first entry is too high, whole buffer
620	 * can be skipped. In this case return time stamp.
621	 */
622	aux_ts = get_trailer_time(buf);
623	if (!aux_ts) {
624		pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
625		       (s64)sfq->buffer->data_offset);
626		aux_ts = ~0ULL;
627		goto out;
628	}
629	if (aux_ts > *ts) {
630		*ts = aux_ts;
631		return 0;
632	}
633
634	while (pos < len) {
635		/* Handle Basic entry */
636		basic = (struct hws_basic_entry *)(buf + pos);
637		if (s390_cpumsf_make_event(pos, basic, sfq))
638			pos += bsdes;
639		else {
640			err = -EBADF;
641			goto out;
642		}
643
644		pos += dsdes;	/* Skip diagnostic entry */
645
646		/* Check for trailer entry */
647		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
648			pos = (pos + S390_CPUMSF_PAGESZ)
649			       & ~(S390_CPUMSF_PAGESZ - 1);
650			/* Check existence of next page */
651			if (pos >= len)
652				break;
653			aux_ts = get_trailer_time(buf + pos);
654			if (!aux_ts) {
655				aux_ts = ~0ULL;
656				goto out;
657			}
658			if (aux_ts > *ts) {
659				*ts = aux_ts;
660				sfq->buffer->use_data += pos;
661				sfq->buffer->use_size -= pos;
662				return 0;
663			}
664		}
665	}
666out:
667	*ts = aux_ts;
668	sfq->buffer->use_size = 0;
669	sfq->buffer->use_data = NULL;
670	return err;	/* Buffer completely scanned or error */
671}
672
673/* Run the s390 auxiliary trace decoder.
674 * Select the queue buffer to operate on, the caller already selected
675 * the proper queue, depending on second parameter 'ts'.
676 * This is the time stamp until which the auxiliary entries should
677 * be processed. This value is updated by called functions and
678 * returned to the caller.
679 *
680 * Resume processing in the current buffer. If there is no buffer
681 * get a new buffer from the queue and setup start position for
682 * processing.
683 * When a buffer is completely processed remove it from the queue
684 * before returning.
685 *
686 * This function returns
687 * 1: When the queue is empty. Second parameter will be set to
688 *    maximum time stamp.
689 * 0: Normal processing done.
690 * <0: Error during queue buffer setup. This causes the caller
691 *     to stop processing completely.
692 */
693static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
694				   u64 *ts)
695{
696
697	struct auxtrace_buffer *buffer;
698	struct auxtrace_queue *queue;
699	int err;
700
701	queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
702
703	/* Get buffer and last position in buffer to resume
704	 * decoding the auxiliary entries. One buffer might be large
705	 * and decoding might stop in between. This depends on the time
706	 * stamp of the trailer entry in each page of the auxiliary
707	 * data and the time stamp of the event triggering the decoding.
708	 */
709	if (sfq->buffer == NULL) {
710		sfq->buffer = buffer = auxtrace_buffer__next(queue,
711							     sfq->buffer);
712		if (!buffer) {
713			*ts = ~0ULL;
714			return 1;	/* Processing done on this queue */
715		}
716		/* Start with a new buffer on this queue */
717		if (buffer->data) {
718			buffer->use_size = buffer->size;
719			buffer->use_data = buffer->data;
720		}
721		if (sfq->logfile) {	/* Write into log file */
722			size_t rc = fwrite(buffer->data, buffer->size, 1,
723					   sfq->logfile);
724			if (rc != 1)
725				pr_err("Failed to write auxiliary data\n");
726		}
727	} else
728		buffer = sfq->buffer;
729
730	if (!buffer->data) {
731		int fd = perf_data__fd(sfq->sf->session->data);
732
733		buffer->data = auxtrace_buffer__get_data(buffer, fd);
734		if (!buffer->data)
735			return -ENOMEM;
736		buffer->use_size = buffer->size;
737		buffer->use_data = buffer->data;
738
739		if (sfq->logfile) {	/* Write into log file */
740			size_t rc = fwrite(buffer->data, buffer->size, 1,
741					   sfq->logfile);
742			if (rc != 1)
743				pr_err("Failed to write auxiliary data\n");
744		}
745	}
746	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
747		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
748		  buffer->size, buffer->use_size);
749	err = s390_cpumsf_samples(sfq, ts);
750
751	/* If non-zero, there is either an error (err < 0) or the buffer is
752	 * completely done (err > 0). The error is unrecoverable, usually
753	 * some descriptors could not be read successfully, so continue with
754	 * the next buffer.
755	 * In both cases the parameter 'ts' has been updated.
756	 */
757	if (err) {
758		sfq->buffer = NULL;
759		list_del_init(&buffer->list);
760		auxtrace_buffer__free(buffer);
761		if (err > 0)		/* Buffer done, no error */
762			err = 0;
763	}
764	return err;
765}
766
767static struct s390_cpumsf_queue *
768s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
769{
770	struct s390_cpumsf_queue *sfq;
771
772	sfq = zalloc(sizeof(struct s390_cpumsf_queue));
773	if (sfq == NULL)
774		return NULL;
775
776	sfq->sf = sf;
777	sfq->queue_nr = queue_nr;
778	sfq->cpu = -1;
779	if (sf->use_logfile) {
780		char *name;
781		int rc;
782
783		rc = (sf->logdir)
784			? asprintf(&name, "%s/aux.smp.%02x",
785				 sf->logdir, queue_nr)
786			: asprintf(&name, "aux.smp.%02x", queue_nr);
787		if (rc > 0)
788			sfq->logfile = fopen(name, "w");
789		if (sfq->logfile == NULL) {
790			pr_err("Failed to open auxiliary log file %s,"
791			       "continue...\n", name);
792			sf->use_logfile = false;
793		}
794		free(name);
795	}
796	return sfq;
797}
798
799static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
800				   struct auxtrace_queue *queue,
801				   unsigned int queue_nr, u64 ts)
802{
803	struct s390_cpumsf_queue *sfq = queue->priv;
804
805	if (list_empty(&queue->head))
806		return 0;
807
808	if (sfq == NULL) {
809		sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
810		if (!sfq)
811			return -ENOMEM;
812		queue->priv = sfq;
813
814		if (queue->cpu != -1)
815			sfq->cpu = queue->cpu;
816	}
817	return auxtrace_heap__add(&sf->heap, queue_nr, ts);
818}
819
820static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
821{
822	unsigned int i;
823	int ret = 0;
824
825	for (i = 0; i < sf->queues.nr_queues; i++) {
826		ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
827					      i, ts);
828		if (ret)
829			break;
830	}
831	return ret;
832}
833
834static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
835{
836	if (!sf->queues.new_data)
837		return 0;
838
839	sf->queues.new_data = false;
840	return s390_cpumsf_setup_queues(sf, ts);
841}
842
843static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
844{
845	unsigned int queue_nr;
846	u64 ts;
847	int ret;
848
849	while (1) {
850		struct auxtrace_queue *queue;
851		struct s390_cpumsf_queue *sfq;
852
853		if (!sf->heap.heap_cnt)
854			return 0;
855
856		if (sf->heap.heap_array[0].ordinal >= timestamp)
857			return 0;
858
859		queue_nr = sf->heap.heap_array[0].queue_nr;
860		queue = &sf->queues.queue_array[queue_nr];
861		sfq = queue->priv;
862
863		auxtrace_heap__pop(&sf->heap);
864		if (sf->heap.heap_cnt) {
865			ts = sf->heap.heap_array[0].ordinal + 1;
866			if (ts > timestamp)
867				ts = timestamp;
868		} else {
869			ts = timestamp;
870		}
871
872		ret = s390_cpumsf_run_decoder(sfq, &ts);
873		if (ret < 0) {
874			auxtrace_heap__add(&sf->heap, queue_nr, ts);
875			return ret;
876		}
877		if (!ret) {
878			ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
879			if (ret < 0)
880				return ret;
881		}
882	}
883	return 0;
884}
885
886static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
887				   pid_t pid, pid_t tid, u64 ip, u64 timestamp)
888{
889	char msg[MAX_AUXTRACE_ERROR_MSG];
890	union perf_event event;
891	int err;
892
893	strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
894	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
895			     code, cpu, pid, tid, ip, msg, timestamp);
896
897	err = perf_session__deliver_synth_event(sf->session, &event, NULL);
898	if (err)
899		pr_err("s390 Auxiliary Trace: failed to deliver error event,"
900			"error %d\n", err);
901	return err;
902}
903
904static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
905{
906	return s390_cpumsf_synth_error(sf, 1, sample->cpu,
907				       sample->pid, sample->tid, 0,
908				       sample->time);
909}
910
911static int
912s390_cpumsf_process_event(struct perf_session *session,
913			  union perf_event *event,
914			  struct perf_sample *sample,
915			  struct perf_tool *tool)
916{
917	struct s390_cpumsf *sf = container_of(session->auxtrace,
918					      struct s390_cpumsf,
919					      auxtrace);
920	u64 timestamp = sample->time;
921	struct evsel *ev_bc000;
922
923	int err = 0;
924
925	if (dump_trace)
926		return 0;
927
928	if (!tool->ordered_events) {
929		pr_err("s390 Auxiliary Trace requires ordered events\n");
930		return -EINVAL;
931	}
932
933	if (event->header.type == PERF_RECORD_SAMPLE &&
934	    sample->raw_size) {
935		/* Handle event with raw data */
936		ev_bc000 = evlist__event2evsel(session->evlist, event);
937		if (ev_bc000 &&
938		    ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG)
939			err = s390_cpumcf_dumpctr(sf, sample);
940		return err;
941	}
942
943	if (event->header.type == PERF_RECORD_AUX &&
944	    event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
945		return s390_cpumsf_lost(sf, sample);
946
947	if (timestamp) {
948		err = s390_cpumsf_update_queues(sf, timestamp);
949		if (!err)
950			err = s390_cpumsf_process_queues(sf, timestamp);
951	}
952	return err;
953}
954
955struct s390_cpumsf_synth {
956	struct perf_tool cpumsf_tool;
957	struct perf_session *session;
958};
959
960static int
961s390_cpumsf_process_auxtrace_event(struct perf_session *session,
962				   union perf_event *event __maybe_unused,
963				   struct perf_tool *tool __maybe_unused)
964{
965	struct s390_cpumsf *sf = container_of(session->auxtrace,
966					      struct s390_cpumsf,
967					      auxtrace);
968
969	int fd = perf_data__fd(session->data);
970	struct auxtrace_buffer *buffer;
971	off_t data_offset;
972	int err;
973
974	if (sf->data_queued)
975		return 0;
976
977	if (perf_data__is_pipe(session->data)) {
978		data_offset = 0;
979	} else {
980		data_offset = lseek(fd, 0, SEEK_CUR);
981		if (data_offset == -1)
982			return -errno;
983	}
984
985	err = auxtrace_queues__add_event(&sf->queues, session, event,
986					 data_offset, &buffer);
987	if (err)
988		return err;
989
990	/* Dump here after copying piped trace out of the pipe */
991	if (dump_trace) {
992		if (auxtrace_buffer__get_data(buffer, fd)) {
993			s390_cpumsf_dump_event(sf, buffer->data,
994					       buffer->size);
995			auxtrace_buffer__put_data(buffer);
996		}
997	}
998	return 0;
999}
1000
1001static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
1002{
1003}
1004
1005static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
1006			     struct perf_tool *tool __maybe_unused)
1007{
1008	return 0;
1009}
1010
1011static void s390_cpumsf_free_queues(struct perf_session *session)
1012{
1013	struct s390_cpumsf *sf = container_of(session->auxtrace,
1014					      struct s390_cpumsf,
1015					      auxtrace);
1016	struct auxtrace_queues *queues = &sf->queues;
1017	unsigned int i;
1018
1019	for (i = 0; i < queues->nr_queues; i++) {
1020		struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
1021						queues->queue_array[i].priv;
1022
1023		if (sfq != NULL) {
1024			if (sfq->logfile) {
1025				fclose(sfq->logfile);
1026				sfq->logfile = NULL;
1027			}
1028			if (sfq->logfile_ctr) {
1029				fclose(sfq->logfile_ctr);
1030				sfq->logfile_ctr = NULL;
1031			}
1032		}
1033		zfree(&queues->queue_array[i].priv);
1034	}
1035	auxtrace_queues__free(queues);
1036}
1037
1038static void s390_cpumsf_free(struct perf_session *session)
1039{
1040	struct s390_cpumsf *sf = container_of(session->auxtrace,
1041					      struct s390_cpumsf,
1042					      auxtrace);
1043
1044	auxtrace_heap__free(&sf->heap);
1045	s390_cpumsf_free_queues(session);
1046	session->auxtrace = NULL;
1047	zfree(&sf->logdir);
1048	free(sf);
1049}
1050
1051static bool
1052s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused,
1053			      struct evsel *evsel)
1054{
1055	return evsel->core.attr.type == PERF_TYPE_RAW &&
1056	       evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG;
1057}
1058
1059static int s390_cpumsf_get_type(const char *cpuid)
1060{
1061	int ret, family = 0;
1062
1063	ret = sscanf(cpuid, "%*[^,],%u", &family);
1064	return (ret == 1) ? family : 0;
1065}
1066
1067/* Check itrace options set on perf report command.
1068 * Return true, if none are set or all options specified can be
1069 * handled on s390 (currently only option 'd' for logging.
1070 * Return false otherwise.
1071 */
1072static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
1073{
1074	bool ison = false;
1075
1076	if (!itops || !itops->set)
1077		return true;
1078	ison = itops->inject || itops->instructions || itops->branches ||
1079		itops->transactions || itops->ptwrites ||
1080		itops->pwr_events || itops->errors ||
1081		itops->dont_decode || itops->calls || itops->returns ||
1082		itops->callchain || itops->thread_stack ||
1083		itops->last_branch || itops->add_callchain ||
1084		itops->add_last_branch;
1085	if (!ison)
1086		return true;
1087	pr_err("Unsupported --itrace options specified\n");
1088	return false;
1089}
1090
1091/* Check for AUXTRACE dump directory if it is needed.
1092 * On failure print an error message but continue.
1093 * Return 0 on wrong keyword in config file and 1 otherwise.
1094 */
1095static int s390_cpumsf__config(const char *var, const char *value, void *cb)
1096{
1097	struct s390_cpumsf *sf = cb;
1098	struct stat stbuf;
1099	int rc;
1100
1101	if (strcmp(var, "auxtrace.dumpdir"))
1102		return 0;
1103	sf->logdir = strdup(value);
1104	if (sf->logdir == NULL) {
1105		pr_err("Failed to find auxtrace log directory %s,"
1106		       " continue with current directory...\n", value);
1107		return 1;
1108	}
1109	rc = stat(sf->logdir, &stbuf);
1110	if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
1111		pr_err("Missing auxtrace log directory %s,"
1112		       " continue with current directory...\n", value);
1113		zfree(&sf->logdir);
1114	}
1115	return 1;
1116}
1117
1118int s390_cpumsf_process_auxtrace_info(union perf_event *event,
1119				      struct perf_session *session)
1120{
1121	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1122	struct s390_cpumsf *sf;
1123	int err;
1124
1125	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info))
1126		return -EINVAL;
1127
1128	sf = zalloc(sizeof(struct s390_cpumsf));
1129	if (sf == NULL)
1130		return -ENOMEM;
1131
1132	if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
1133		err = -EINVAL;
1134		goto err_free;
1135	}
1136	sf->use_logfile = session->itrace_synth_opts->log;
1137	if (sf->use_logfile)
1138		perf_config(s390_cpumsf__config, sf);
1139
1140	err = auxtrace_queues__init(&sf->queues);
1141	if (err)
1142		goto err_free;
1143
1144	sf->session = session;
1145	sf->machine = &session->machines.host; /* No kvm support */
1146	sf->auxtrace_type = auxtrace_info->type;
1147	sf->pmu_type = PERF_TYPE_RAW;
1148	sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid);
1149
1150	sf->auxtrace.process_event = s390_cpumsf_process_event;
1151	sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
1152	sf->auxtrace.flush_events = s390_cpumsf_flush;
1153	sf->auxtrace.free_events = s390_cpumsf_free_events;
1154	sf->auxtrace.free = s390_cpumsf_free;
1155	sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace;
1156	session->auxtrace = &sf->auxtrace;
1157
1158	if (dump_trace)
1159		return 0;
1160
1161	err = auxtrace_queues__process_index(&sf->queues, session);
1162	if (err)
1163		goto err_free_queues;
1164
1165	if (sf->queues.populated)
1166		sf->data_queued = true;
1167
1168	return 0;
1169
1170err_free_queues:
1171	auxtrace_queues__free(&sf->queues);
1172	session->auxtrace = NULL;
1173err_free:
1174	zfree(&sf->logdir);
1175	free(sf);
1176	return err;
1177}
1178