1/*-
2 * Copyright (c) 2016-2017 Microsoft Corp.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/10/sys/dev/hyperv/pcib/vmbus_pcib.c 337959 2018-08-17 06:31:30Z dim $");
29
30#ifdef NEW_PCIB
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/types.h>
35#include <sys/malloc.h>
36#include <sys/module.h>
37#include <sys/kernel.h>
38#include <sys/queue.h>
39#include <sys/lock.h>
40#include <sys/sx.h>
41#include <sys/smp.h>
42#include <sys/sysctl.h>
43#include <sys/bus.h>
44#include <sys/rman.h>
45#include <sys/mutex.h>
46#include <sys/errno.h>
47
48#include <vm/vm.h>
49#include <vm/vm_param.h>
50#include <vm/vm_kern.h>
51#include <vm/pmap.h>
52
53#include <machine/atomic.h>
54#include <machine/bus.h>
55#include <machine/frame.h>
56#include <machine/pci_cfgreg.h>
57#include <machine/resource.h>
58
59#include <sys/pciio.h>
60#include <dev/pci/pcireg.h>
61#include <dev/pci/pcivar.h>
62#include <dev/pci/pci_private.h>
63#include <dev/pci/pcib_private.h>
64#include "pcib_if.h"
65
66#include <machine/intr_machdep.h>
67#include <x86/apicreg.h>
68
69#include <dev/hyperv/include/hyperv.h>
70#include <dev/hyperv/include/hyperv_busdma.h>
71#include <dev/hyperv/include/vmbus_xact.h>
72#include <dev/hyperv/vmbus/vmbus_reg.h>
73#include <dev/hyperv/vmbus/vmbus_chanvar.h>
74
75#include "vmbus_if.h"
76
77#if __FreeBSD_version < 1100000
78typedef u_long rman_res_t;
79#define RM_MAX_END	(~(rman_res_t)0)
80#endif
81
82struct completion {
83	unsigned int done;
84	struct mtx lock;
85};
86
87static void
88init_completion(struct completion *c)
89{
90	memset(c, 0, sizeof(*c));
91	mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
92	c->done = 0;
93}
94
95static void
96free_completion(struct completion *c)
97{
98	mtx_destroy(&c->lock);
99}
100
101static void
102complete(struct completion *c)
103{
104	mtx_lock(&c->lock);
105	c->done++;
106	mtx_unlock(&c->lock);
107	wakeup(c);
108}
109
110static void
111wait_for_completion(struct completion *c)
112{
113	mtx_lock(&c->lock);
114	while (c->done == 0)
115		mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
116	c->done--;
117	mtx_unlock(&c->lock);
118}
119
/*
 * Encode a protocol version number: the major version goes in the upper
 * 16 bits and the minor version in the lower 16 bits.
 *
 * The second operand of the OR must be 'minor'; using 'major' twice only
 * happens to give the right answer when major == minor (e.g. 1.1).
 */
#define PCI_MAKE_VERSION(major, minor) \
	((uint32_t)(((major) << 16) | (minor)))

/* Protocol versions this driver knows about. */
enum {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
};
126
/*
 * Geometry of the config-space MMIO window.  CFG_PAGE_OFFSET is the
 * offset added to a config register offset when accessing the window;
 * CFG_PAGE_SIZE is what remains of the window past that offset.
 */
#define PCI_CONFIG_MMIO_LENGTH		0x2000
#define CFG_PAGE_OFFSET			0x1000
#define CFG_PAGE_SIZE			(PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
130
131/*
132 * Message Types
133 */
134
/*
 * Message type codes carried in struct pci_message.  All codes are
 * offsets from PCI_MESSAGE_BASE; offsets 2 and 3 are not defined here.
 */
enum pci_message_type {
	/*
	 * Version 1.1
	 */
	PCI_MESSAGE_BASE		= 0x42490000,
	PCI_BUS_RELATIONS		= PCI_MESSAGE_BASE + 0x00,
	PCI_QUERY_BUS_RELATIONS		= PCI_MESSAGE_BASE + 0x01,
	PCI_POWER_STATE_CHANGE		= PCI_MESSAGE_BASE + 0x04,
	PCI_QUERY_RESOURCE_REQUIREMENTS	= PCI_MESSAGE_BASE + 0x05,
	PCI_QUERY_RESOURCE_RESOURCES	= PCI_MESSAGE_BASE + 0x06,
	PCI_BUS_D0ENTRY			= PCI_MESSAGE_BASE + 0x07,
	PCI_BUS_D0EXIT			= PCI_MESSAGE_BASE + 0x08,
	PCI_READ_BLOCK			= PCI_MESSAGE_BASE + 0x09,
	PCI_WRITE_BLOCK			= PCI_MESSAGE_BASE + 0x0A,
	PCI_EJECT			= PCI_MESSAGE_BASE + 0x0B,
	PCI_QUERY_STOP			= PCI_MESSAGE_BASE + 0x0C,
	PCI_REENABLE			= PCI_MESSAGE_BASE + 0x0D,
	PCI_QUERY_STOP_FAILED		= PCI_MESSAGE_BASE + 0x0E,
	PCI_EJECTION_COMPLETE		= PCI_MESSAGE_BASE + 0x0F,
	PCI_RESOURCES_ASSIGNED		= PCI_MESSAGE_BASE + 0x10,
	PCI_RESOURCES_RELEASED		= PCI_MESSAGE_BASE + 0x11,
	PCI_INVALIDATE_BLOCK		= PCI_MESSAGE_BASE + 0x12,
	PCI_QUERY_PROTOCOL_VERSION	= PCI_MESSAGE_BASE + 0x13,
	PCI_CREATE_INTERRUPT_MESSAGE	= PCI_MESSAGE_BASE + 0x14,
	PCI_DELETE_INTERRUPT_MESSAGE	= PCI_MESSAGE_BASE + 0x15,
	PCI_MESSAGE_MAXIMUM
};
162
163/*
164 * Structures defining the virtual PCI Express protocol.
165 */
166
/*
 * A protocol version, viewable either as one 32-bit value or as
 * separate 16-bit minor/major parts.
 */
union pci_version {
	struct {
		uint16_t	minor_version;
		uint16_t	major_version;
	} parts;
	uint32_t	version;
} __packed;
174
175/*
176 * This representation is the one used in Windows, which is
177 * what is expected when sending this back and forth with
178 * the Hyper-V parent partition.
179 */
/* 5-bit slot, 3-bit function, remaining 24 bits reserved. */
union win_slot_encoding {
	struct {
		uint32_t slot:5;
		uint32_t func:3;
		uint32_t reserved:24;
	} bits;
	uint32_t	val;
} __packed;
188
189struct pci_func_desc {
190	uint16_t	v_id;	/* vendor ID */
191	uint16_t	d_id;	/* device ID */
192	uint8_t		rev;
193	uint8_t		prog_intf;
194	uint8_t		subclass;
195	uint8_t		base_class;
196	uint32_t	subsystem_id;
197	union win_slot_encoding wslot;
198	uint32_t	ser;	/* serial number */
199} __packed;
200
/* MSI descriptor embedded in struct pci_create_interrupt. */
struct hv_msi_desc {
	uint8_t			vector;
	uint8_t			delivery_mode;
	uint16_t		vector_count;
	uint32_t		reserved;
	uint64_t		cpu_mask;
} __packed;
208
/*
 * Interrupt descriptor returned in struct pci_create_int_response and
 * sent back in struct pci_delete_interrupt.
 */
struct tran_int_desc {
	uint16_t		reserved;
	uint16_t		vector_count;
	uint32_t		data;
	uint64_t		address;
} __packed;
215
/* Common message header: a single type code (enum pci_message_type). */
struct pci_message {
	uint32_t		type;
} __packed;
219
220struct pci_child_message {
221	struct pci_message message_type;
222	union win_slot_encoding wslot;
223} __packed;
224
225struct pci_incoming_message {
226	struct vmbus_chanpkt_hdr hdr;
227	struct pci_message message_type;
228} __packed;
229
230struct pci_response {
231	struct vmbus_chanpkt_hdr hdr;
232	int32_t status;	/* negative values are failures */
233} __packed;
234
235struct pci_packet {
236	void (*completion_func)(void *context, struct pci_response *resp,
237	    int resp_packet_size);
238	void *compl_ctxt;
239
240	struct pci_message message[0];
241};
242
243/*
244 * Specific message types supporting the PCI protocol.
245 */
246
247struct pci_version_request {
248	struct pci_message message_type;
249	uint32_t protocol_version;
250	uint32_t is_last_attempt:1;
251	uint32_t reservedz:31;
252} __packed;
253
254struct pci_bus_d0_entry {
255	struct pci_message message_type;
256	uint32_t reserved;
257	uint64_t mmio_base;
258} __packed;
259
260struct pci_bus_relations {
261	struct pci_incoming_message incoming;
262	uint32_t device_count;
263	struct pci_func_desc func[0];
264} __packed;
265
266#define MAX_NUM_BARS	(PCIR_MAX_BAR_0 + 1)
267struct pci_q_res_req_response {
268	struct vmbus_chanpkt_hdr hdr;
269	int32_t status; /* negative values are failures */
270	uint32_t probed_bar[MAX_NUM_BARS];
271} __packed;
272
273struct pci_resources_assigned {
274	struct pci_message message_type;
275	union win_slot_encoding wslot;
276	uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
277	uint32_t msi_descriptors;
278	uint32_t reserved[4];
279} __packed;
280
281struct pci_create_interrupt {
282	struct pci_message message_type;
283	union win_slot_encoding wslot;
284	struct hv_msi_desc int_desc;
285} __packed;
286
287struct pci_create_int_response {
288	struct pci_response response;
289	uint32_t reserved;
290	struct tran_int_desc int_desc;
291} __packed;
292
293struct pci_delete_interrupt {
294	struct pci_message message_type;
295	union win_slot_encoding wslot;
296	struct tran_int_desc int_desc;
297} __packed;
298
299struct pci_dev_incoming {
300	struct pci_incoming_message incoming;
301	union win_slot_encoding wslot;
302} __packed;
303
304struct pci_eject_response {
305	struct pci_message message_type;
306	union win_slot_encoding wslot;
307	uint32_t status;
308} __packed;
309
310/*
311 * Driver specific state.
312 */
313
/*
 * Bus state.  Bus rescans triggered by relation updates are only
 * performed once the state is hv_pcibus_installed.
 */
enum hv_pcibus_state {
	hv_pcibus_init		= 0,
	hv_pcibus_installed,
};
318
319struct hv_pcibus {
320	device_t pcib;
321	device_t pci_bus;
322	struct vmbus_pcib_softc *sc;
323
324	uint16_t pci_domain;
325
326	enum hv_pcibus_state state;
327
328	struct resource *cfg_res;
329
330	struct completion query_completion, *query_comp;
331
332	struct mtx config_lock; /* Avoid two threads writing index page */
333	struct mtx device_list_lock;    /* Protect lists below */
334	TAILQ_HEAD(, hv_pci_dev) children;
335	TAILQ_HEAD(, hv_dr_state) dr_list;
336
337	volatile int detaching;
338};
339
340struct hv_pci_dev {
341	TAILQ_ENTRY(hv_pci_dev) link;
342
343	struct pci_func_desc desc;
344
345	bool reported_missing;
346
347	struct hv_pcibus *hbus;
348	struct task eject_task;
349
350	TAILQ_HEAD(, hv_irq_desc) irq_desc_list;
351
352	/*
353	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
354	 * read it back, for each of the BAR offsets within config space.
355	 */
356	uint32_t probed_bar[MAX_NUM_BARS];
357};
358
/*
 * Tracks "Device Relations" messages from the host, which must be
 * processed in order.
 */
363struct hv_dr_work {
364	struct task task;
365	struct hv_pcibus *bus;
366};
367
368struct hv_dr_state {
369	TAILQ_ENTRY(hv_dr_state) link;
370	uint32_t device_count;
371	struct pci_func_desc func[0];
372};
373
374struct hv_irq_desc {
375	TAILQ_ENTRY(hv_irq_desc) link;
376	struct tran_int_desc desc;
377	int irq;
378};
379
/*
 * devfn packing: bits 7..3 hold the slot (5 bits), bits 2..0 hold the
 * function (3 bits).
 */
#define PCI_DEVFN(slot, func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)		((devfn) & 0x07)
383
384static uint32_t
385devfn_to_wslot(unsigned int devfn)
386{
387	union win_slot_encoding wslot;
388
389	wslot.val = 0;
390	wslot.bits.slot = PCI_SLOT(devfn);
391	wslot.bits.func = PCI_FUNC(devfn);
392
393	return (wslot.val);
394}
395
396static unsigned int
397wslot_to_devfn(uint32_t wslot)
398{
399	union win_slot_encoding encoding;
400	unsigned int slot;
401	unsigned int func;
402
403	encoding.val = wslot;
404
405	slot = encoding.bits.slot;
406	func = encoding.bits.func;
407
408	return (PCI_DEVFN(slot, func));
409}
410
/* Per-device softc for the bridge. */
struct vmbus_pcib_softc {
	struct vmbus_channel	*chan;
	void			*rx_buf;	/* initial receive buffer of the channel callback */

	struct taskqueue	*taskq;		/* runs pci_devices_present_work() */

	struct hv_pcibus	*hbus;
};
419
420/* {44C4F61D-4444-4400-9D52-802E27EDE19F} */
421static const struct hyperv_guid g_pass_through_dev_type = {
422	.hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44,
423	    0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F}
424};
425
426struct hv_pci_compl {
427	struct completion host_event;
428	int32_t completion_status;
429};
430
431struct q_res_req_compl {
432	struct completion host_event;
433	struct hv_pci_dev *hpdev;
434};
435
436struct compose_comp_ctxt {
437	struct hv_pci_compl comp_pkt;
438	struct tran_int_desc int_desc;
439};
440
441static void
442hv_pci_generic_compl(void *context, struct pci_response *resp,
443    int resp_packet_size)
444{
445	struct hv_pci_compl *comp_pkt = context;
446
447	if (resp_packet_size >= sizeof(struct pci_response))
448		comp_pkt->completion_status = resp->status;
449	else
450		comp_pkt->completion_status = -1;
451
452	complete(&comp_pkt->host_event);
453}
454
455static void
456q_resource_requirements(void *context, struct pci_response *resp,
457    int resp_packet_size)
458{
459	struct q_res_req_compl *completion = context;
460	struct pci_q_res_req_response *q_res_req =
461	    (struct pci_q_res_req_response *)resp;
462	int i;
463
464	if (resp->status < 0) {
465		printf("vmbus_pcib: failed to query resource requirements\n");
466	} else {
467		for (i = 0; i < MAX_NUM_BARS; i++)
468			completion->hpdev->probed_bar[i] =
469			    q_res_req->probed_bar[i];
470	}
471
472	complete(&completion->host_event);
473}
474
475static void
476hv_pci_compose_compl(void *context, struct pci_response *resp,
477    int resp_packet_size)
478{
479	struct compose_comp_ctxt *comp_pkt = context;
480	struct pci_create_int_response *int_resp =
481	    (struct pci_create_int_response *)resp;
482
483	comp_pkt->comp_pkt.completion_status = resp->status;
484	comp_pkt->int_desc = int_resp->int_desc;
485	complete(&comp_pkt->comp_pkt.host_event);
486}
487
488static void
489hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid)
490{
491	struct pci_delete_interrupt *int_pkt;
492	struct {
493		struct pci_packet pkt;
494		uint8_t buffer[sizeof(struct pci_delete_interrupt)];
495	} ctxt;
496
497	memset(&ctxt, 0, sizeof(ctxt));
498	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
499	int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE;
500	int_pkt->wslot.val = hpdev->desc.wslot.val;
501	int_pkt->int_desc = hid->desc;
502
503	vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
504	    int_pkt, sizeof(*int_pkt), 0);
505
506	free(hid, M_DEVBUF);
507}
508
509static void
510hv_pci_delete_device(struct hv_pci_dev *hpdev)
511{
512	struct hv_pcibus *hbus = hpdev->hbus;
513	struct hv_irq_desc *hid, *tmp_hid;
514	device_t pci_dev;
515	int devfn;
516
517	devfn = wslot_to_devfn(hpdev->desc.wslot.val);
518
519	mtx_lock(&Giant);
520
521	pci_dev = pci_find_dbsf(hbus->pci_domain,
522	    0, PCI_SLOT(devfn), PCI_FUNC(devfn));
523	if (pci_dev)
524		device_delete_child(hbus->pci_bus, pci_dev);
525
526	mtx_unlock(&Giant);
527
528	mtx_lock(&hbus->device_list_lock);
529	TAILQ_REMOVE(&hbus->children, hpdev, link);
530	mtx_unlock(&hbus->device_list_lock);
531
532	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid)
533		hv_int_desc_free(hpdev, hid);
534
535	free(hpdev, M_DEVBUF);
536}
537
538static struct hv_pci_dev *
539new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc)
540{
541	struct hv_pci_dev *hpdev;
542	struct pci_child_message *res_req;
543	struct q_res_req_compl comp_pkt;
544	struct {
545		struct pci_packet pkt;
546		uint8_t buffer[sizeof(struct pci_child_message)];
547	} ctxt;
548	int ret;
549
550	hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO);
551	hpdev->hbus = hbus;
552
553	TAILQ_INIT(&hpdev->irq_desc_list);
554
555	init_completion(&comp_pkt.host_event);
556	comp_pkt.hpdev = hpdev;
557
558	ctxt.pkt.compl_ctxt = &comp_pkt;
559	ctxt.pkt.completion_func = q_resource_requirements;
560
561	res_req = (struct pci_child_message *)&ctxt.pkt.message;
562	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
563	res_req->wslot.val = desc->wslot.val;
564
565	ret = vmbus_chan_send(hbus->sc->chan,
566	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
567	    res_req, sizeof(*res_req), (uint64_t)(uintptr_t)&ctxt.pkt);
568	if (ret)
569		goto err;
570
571	wait_for_completion(&comp_pkt.host_event);
572	free_completion(&comp_pkt.host_event);
573
574	hpdev->desc = *desc;
575
576	mtx_lock(&hbus->device_list_lock);
577	if (TAILQ_EMPTY(&hbus->children))
578		hbus->pci_domain = desc->ser & 0xFFFF;
579	TAILQ_INSERT_TAIL(&hbus->children, hpdev, link);
580	mtx_unlock(&hbus->device_list_lock);
581	return (hpdev);
582err:
583	free_completion(&comp_pkt.host_event);
584	free(hpdev, M_DEVBUF);
585	return (NULL);
586}
587
588#if __FreeBSD_version < 1100000
589
590/* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */
591
592static struct pci_devinfo *
593pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
594    int slot, int func, size_t dinfo_size)
595{
596	struct pci_devinfo *dinfo;
597
598	dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size);
599	if (dinfo != NULL)
600		pci_add_child(dev, dinfo);
601
602	return (dinfo);
603}
604
605static int
606pci_rescan(device_t dev)
607{
608#define	REG(n, w)	PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
609	device_t pcib = device_get_parent(dev);
610	struct pci_softc *sc;
611	device_t child, *devlist, *unchanged;
612	int devcount, error, i, j, maxslots, oldcount;
613	int busno, domain, s, f, pcifunchigh;
614	uint8_t hdrtype;
615
616	/* No need to check for ARI on a rescan. */
617	error = device_get_children(dev, &devlist, &devcount);
618	if (error)
619		return (error);
620	if (devcount != 0) {
621		unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
622		    M_NOWAIT | M_ZERO);
623		if (unchanged == NULL) {
624			free(devlist, M_TEMP);
625			return (ENOMEM);
626		}
627	} else
628		unchanged = NULL;
629
630	sc = device_get_softc(dev);
631	domain = pcib_get_domain(dev);
632	busno = pcib_get_bus(dev);
633	maxslots = PCIB_MAXSLOTS(pcib);
634	for (s = 0; s <= maxslots; s++) {
635		/* If function 0 is not present, skip to the next slot. */
636		f = 0;
637		if (REG(PCIR_VENDOR, 2) == 0xffff)
638			continue;
639		pcifunchigh = 0;
640		hdrtype = REG(PCIR_HDRTYPE, 1);
641		if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
642			continue;
643		if (hdrtype & PCIM_MFDEV)
644			pcifunchigh = PCIB_MAXFUNCS(pcib);
645		for (f = 0; f <= pcifunchigh; f++) {
646			if (REG(PCIR_VENDOR, 2) == 0xffff)
647				continue;
648
649			/*
650			 * Found a valid function.  Check if a
651			 * device_t for this device already exists.
652			 */
653			for (i = 0; i < devcount; i++) {
654				child = devlist[i];
655				if (child == NULL)
656					continue;
657				if (pci_get_slot(child) == s &&
658				    pci_get_function(child) == f) {
659					unchanged[i] = child;
660					goto next_func;
661				}
662			}
663
664			pci_identify_function(pcib, dev, domain, busno, s, f,
665			    sizeof(struct pci_devinfo));
666		next_func:;
667		}
668	}
669
670	/* Remove devices that are no longer present. */
671	for (i = 0; i < devcount; i++) {
672		if (unchanged[i] != NULL)
673			continue;
674		device_delete_child(dev, devlist[i]);
675	}
676
677	free(devlist, M_TEMP);
678	oldcount = devcount;
679
680	/* Try to attach the devices just added. */
681	error = device_get_children(dev, &devlist, &devcount);
682	if (error) {
683		free(unchanged, M_TEMP);
684		return (error);
685	}
686
687	for (i = 0; i < devcount; i++) {
688		for (j = 0; j < oldcount; j++) {
689			if (devlist[i] == unchanged[j])
690				goto next_device;
691		}
692
693		device_probe_and_attach(devlist[i]);
694	next_device:;
695	}
696
697	free(unchanged, M_TEMP);
698	free(devlist, M_TEMP);
699	return (0);
700#undef REG
701}
702
703#else
704
705static int
706pci_rescan(device_t dev)
707{
708	return (BUS_RESCAN(dev));
709}
710
711#endif
712
713static void
714pci_devices_present_work(void *arg, int pending __unused)
715{
716	struct hv_dr_work *dr_wrk = arg;
717	struct hv_dr_state *dr = NULL;
718	struct hv_pcibus *hbus;
719	uint32_t child_no;
720	bool found;
721	struct pci_func_desc *new_desc;
722	struct hv_pci_dev *hpdev, *tmp_hpdev;
723	struct completion *query_comp;
724	bool need_rescan = false;
725
726	hbus = dr_wrk->bus;
727	free(dr_wrk, M_DEVBUF);
728
729	/* Pull this off the queue and process it if it was the last one. */
730	mtx_lock(&hbus->device_list_lock);
731	while (!TAILQ_EMPTY(&hbus->dr_list)) {
732		dr = TAILQ_FIRST(&hbus->dr_list);
733		TAILQ_REMOVE(&hbus->dr_list, dr, link);
734
735		/* Throw this away if the list still has stuff in it. */
736		if (!TAILQ_EMPTY(&hbus->dr_list)) {
737			free(dr, M_DEVBUF);
738			continue;
739		}
740	}
741	mtx_unlock(&hbus->device_list_lock);
742
743	if (!dr)
744		return;
745
746	/* First, mark all existing children as reported missing. */
747	mtx_lock(&hbus->device_list_lock);
748	TAILQ_FOREACH(hpdev, &hbus->children, link)
749		hpdev->reported_missing = true;
750	mtx_unlock(&hbus->device_list_lock);
751
752	/* Next, add back any reported devices. */
753	for (child_no = 0; child_no < dr->device_count; child_no++) {
754		found = false;
755		new_desc = &dr->func[child_no];
756
757		mtx_lock(&hbus->device_list_lock);
758		TAILQ_FOREACH(hpdev, &hbus->children, link) {
759			if ((hpdev->desc.wslot.val ==
760			    new_desc->wslot.val) &&
761			    (hpdev->desc.v_id == new_desc->v_id) &&
762			    (hpdev->desc.d_id == new_desc->d_id) &&
763			    (hpdev->desc.ser == new_desc->ser)) {
764				hpdev->reported_missing = false;
765				found = true;
766				break;
767			}
768		}
769		mtx_unlock(&hbus->device_list_lock);
770
771		if (!found) {
772			if (!need_rescan)
773				need_rescan = true;
774
775			hpdev = new_pcichild_device(hbus, new_desc);
776			if (!hpdev)
777				printf("vmbus_pcib: failed to add a child\n");
778		}
779	}
780
781	/* Remove missing device(s), if any */
782	TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) {
783		if (hpdev->reported_missing)
784			hv_pci_delete_device(hpdev);
785	}
786
787	/* Rescan the bus to find any new device, if necessary. */
788	if (hbus->state == hv_pcibus_installed && need_rescan)
789		pci_rescan(hbus->pci_bus);
790
791	/* Wake up hv_pci_query_relations(), if it's waiting. */
792	query_comp = hbus->query_comp;
793	if (query_comp) {
794		hbus->query_comp = NULL;
795		complete(query_comp);
796	}
797
798	free(dr, M_DEVBUF);
799}
800
801static struct hv_pci_dev *
802get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot)
803{
804	struct hv_pci_dev *hpdev, *ret = NULL;
805
806	mtx_lock(&hbus->device_list_lock);
807	TAILQ_FOREACH(hpdev, &hbus->children, link) {
808		if (hpdev->desc.wslot.val == wslot) {
809			ret = hpdev;
810			break;
811		}
812	}
813	mtx_unlock(&hbus->device_list_lock);
814
815	return (ret);
816}
817
818static void
819hv_pci_devices_present(struct hv_pcibus *hbus,
820    struct pci_bus_relations *relations)
821{
822	struct hv_dr_state *dr;
823	struct hv_dr_work *dr_wrk;
824	unsigned long dr_size;
825
826	if (hbus->detaching && relations->device_count > 0)
827		return;
828
829	dr_size = offsetof(struct hv_dr_state, func) +
830	    (sizeof(struct pci_func_desc) * relations->device_count);
831	dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO);
832
833	dr->device_count = relations->device_count;
834	if (dr->device_count != 0)
835		memcpy(dr->func, relations->func,
836		    sizeof(struct pci_func_desc) * dr->device_count);
837
838	mtx_lock(&hbus->device_list_lock);
839	TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link);
840	mtx_unlock(&hbus->device_list_lock);
841
842	dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO);
843	dr_wrk->bus = hbus;
844	TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk);
845	taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task);
846}
847
848static void
849hv_eject_device_work(void *arg, int pending __unused)
850{
851	struct hv_pci_dev *hpdev = arg;
852	union win_slot_encoding wslot = hpdev->desc.wslot;
853	struct hv_pcibus *hbus = hpdev->hbus;
854	struct pci_eject_response *eject_pkt;
855	struct {
856		struct pci_packet pkt;
857		uint8_t buffer[sizeof(struct pci_eject_response)];
858	} ctxt;
859
860	hv_pci_delete_device(hpdev);
861
862	memset(&ctxt, 0, sizeof(ctxt));
863	eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
864	eject_pkt->message_type.type = PCI_EJECTION_COMPLETE;
865	eject_pkt->wslot.val = wslot.val;
866	vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
867	    eject_pkt, sizeof(*eject_pkt), 0);
868}
869
870static void
871hv_pci_eject_device(struct hv_pci_dev *hpdev)
872{
873	struct hv_pcibus *hbus = hpdev->hbus;
874	struct taskqueue *taskq;
875
876	if (hbus->detaching)
877		return;
878
879	/*
880	 * Push this task into the same taskqueue on which
881	 * vmbus_pcib_attach() runs, so we're sure this task can't run
882	 * concurrently with vmbus_pcib_attach().
883	 */
884	TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev);
885	taskq = vmbus_chan_mgmt_tq(hbus->sc->chan);
886	taskqueue_enqueue(taskq, &hpdev->eject_task);
887}
888
889#define PCIB_PACKET_SIZE	0x100
890
891static void
892vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg)
893{
894	struct vmbus_pcib_softc *sc = arg;
895	struct hv_pcibus *hbus = sc->hbus;
896
897	void *buffer;
898	int bufferlen = PCIB_PACKET_SIZE;
899
900	struct pci_packet *comp_packet;
901	struct pci_response *response;
902	struct pci_incoming_message *new_msg;
903	struct pci_bus_relations *bus_rel;
904	struct pci_dev_incoming *dev_msg;
905	struct hv_pci_dev *hpdev;
906
907	buffer = sc->rx_buf;
908	do {
909		struct vmbus_chanpkt_hdr *pkt = buffer;
910		uint32_t bytes_rxed;
911		int ret;
912
913		bytes_rxed = bufferlen;
914		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
915
916		if (ret == ENOBUFS) {
917			/* Handle large packet */
918			if (bufferlen > PCIB_PACKET_SIZE) {
919				free(buffer, M_DEVBUF);
920				buffer = NULL;
921			}
922
923			/* alloc new buffer */
924			buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO);
925			bufferlen = bytes_rxed;
926
927			continue;
928		}
929
930		if (ret != 0) {
931			/* ignore EIO or EAGAIN */
932			break;
933		}
934
935		if (bytes_rxed <= sizeof(struct pci_response))
936			continue;
937
938		switch (pkt->cph_type) {
939		case VMBUS_CHANPKT_TYPE_COMP:
940			comp_packet =
941			    (struct pci_packet *)(uintptr_t)pkt->cph_xactid;
942			response = (struct pci_response *)pkt;
943			comp_packet->completion_func(comp_packet->compl_ctxt,
944			    response, bytes_rxed);
945			break;
946		case VMBUS_CHANPKT_TYPE_INBAND:
947			new_msg = (struct pci_incoming_message *)buffer;
948
949			switch (new_msg->message_type.type) {
950			case PCI_BUS_RELATIONS:
951				bus_rel = (struct pci_bus_relations *)buffer;
952
953				if (bus_rel->device_count == 0)
954					break;
955
956				if (bytes_rxed <
957				    offsetof(struct pci_bus_relations, func) +
958				        (sizeof(struct pci_func_desc) *
959				            (bus_rel->device_count)))
960					break;
961
962				hv_pci_devices_present(hbus, bus_rel);
963				break;
964
965			case PCI_EJECT:
966				dev_msg = (struct pci_dev_incoming *)buffer;
967				hpdev = get_pcichild_wslot(hbus,
968				    dev_msg->wslot.val);
969
970				if (hpdev)
971					hv_pci_eject_device(hpdev);
972
973				break;
974			default:
975				printf("vmbus_pcib: Unknown msg type 0x%x\n",
976				    new_msg->message_type.type);
977				break;
978			}
979			break;
980		default:
981			printf("vmbus_pcib: Unknown VMBus msg type %hd\n",
982			    pkt->cph_type);
983			break;
984		}
985	} while (1);
986
987	if (bufferlen > PCIB_PACKET_SIZE)
988		free(buffer, M_DEVBUF);
989}
990
991static int
992hv_pci_protocol_negotiation(struct hv_pcibus *hbus)
993{
994	struct pci_version_request *version_req;
995	struct hv_pci_compl comp_pkt;
996	struct {
997		struct pci_packet pkt;
998		uint8_t buffer[sizeof(struct pci_version_request)];
999	} ctxt;
1000	int ret;
1001
1002	init_completion(&comp_pkt.host_event);
1003
1004	ctxt.pkt.completion_func = hv_pci_generic_compl;
1005	ctxt.pkt.compl_ctxt = &comp_pkt;
1006	version_req = (struct pci_version_request *)&ctxt.pkt.message;
1007	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
1008	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
1009	version_req->is_last_attempt = 1;
1010
1011	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
1012	    VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req),
1013	    (uint64_t)(uintptr_t)&ctxt.pkt);
1014	if (ret)
1015		goto out;
1016
1017	wait_for_completion(&comp_pkt.host_event);
1018
1019	if (comp_pkt.completion_status < 0) {
1020		device_printf(hbus->pcib,
1021		    "vmbus_pcib version negotiation failed: %x\n",
1022		    comp_pkt.completion_status);
1023		ret = EPROTO;
1024	} else {
1025		ret = 0;
1026	}
1027out:
1028	free_completion(&comp_pkt.host_event);
1029	return (ret);
1030}
1031
1032/* Ask the host to send along the list of child devices */
1033static int
1034hv_pci_query_relations(struct hv_pcibus *hbus)
1035{
1036	struct pci_message message;
1037	int ret;
1038
1039	message.type = PCI_QUERY_BUS_RELATIONS;
1040	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
1041	    &message, sizeof(message), 0);
1042	return (ret);
1043}
1044
1045static int
1046hv_pci_enter_d0(struct hv_pcibus *hbus)
1047{
1048	struct pci_bus_d0_entry *d0_entry;
1049	struct hv_pci_compl comp_pkt;
1050	struct {
1051		struct pci_packet pkt;
1052		uint8_t buffer[sizeof(struct pci_bus_d0_entry)];
1053	} ctxt;
1054	int ret;
1055
1056	/*
1057	 * Tell the host that the bus is ready to use, and moved into the
1058	 * powered-on state.  This includes telling the host which region
1059	 * of memory-mapped I/O space has been chosen for configuration space
1060	 * access.
1061	 */
1062	init_completion(&comp_pkt.host_event);
1063
1064	ctxt.pkt.completion_func = hv_pci_generic_compl;
1065	ctxt.pkt.compl_ctxt = &comp_pkt;
1066
1067	d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message;
1068	memset(d0_entry, 0, sizeof(*d0_entry));
1069	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
1070	d0_entry->mmio_base = rman_get_start(hbus->cfg_res);
1071
1072	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
1073	    VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry),
1074	    (uint64_t)(uintptr_t)&ctxt.pkt);
1075	if (ret)
1076		goto out;
1077
1078	wait_for_completion(&comp_pkt.host_event);
1079
1080	if (comp_pkt.completion_status < 0) {
1081		device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n");
1082		ret = EPROTO;
1083	} else {
1084		ret = 0;
1085	}
1086
1087out:
1088	free_completion(&comp_pkt.host_event);
1089	return (ret);
1090}
1091
/*
 * It looks like this is only needed by Windows VMs, but let's send the
 * message too, just to make the host happy.
 */
1096static int
1097hv_send_resources_allocated(struct hv_pcibus *hbus)
1098{
1099	struct pci_resources_assigned *res_assigned;
1100	struct hv_pci_compl comp_pkt;
1101	struct hv_pci_dev *hpdev;
1102	struct pci_packet *pkt;
1103	uint32_t wslot;
1104	int ret = 0;
1105
1106	pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned),
1107	    M_DEVBUF, M_WAITOK | M_ZERO);
1108
1109	for (wslot = 0; wslot < 256; wslot++) {
1110		hpdev = get_pcichild_wslot(hbus, wslot);
1111		if (!hpdev)
1112			continue;
1113
1114		init_completion(&comp_pkt.host_event);
1115
1116		memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
1117		pkt->completion_func = hv_pci_generic_compl;
1118		pkt->compl_ctxt = &comp_pkt;
1119
1120		res_assigned = (struct pci_resources_assigned *)&pkt->message;
1121		res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
1122		res_assigned->wslot.val = hpdev->desc.wslot.val;
1123
1124		ret = vmbus_chan_send(hbus->sc->chan,
1125		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
1126		    &pkt->message, sizeof(*res_assigned),
1127		    (uint64_t)(uintptr_t)pkt);
1128		if (ret) {
1129			free_completion(&comp_pkt.host_event);
1130			break;
1131		}
1132
1133		wait_for_completion(&comp_pkt.host_event);
1134		free_completion(&comp_pkt.host_event);
1135
1136		if (comp_pkt.completion_status < 0) {
1137			ret = EPROTO;
1138			device_printf(hbus->pcib,
1139			    "failed to send PCI_RESOURCES_ASSIGNED\n");
1140			break;
1141		}
1142	}
1143
1144	free(pkt, M_DEVBUF);
1145	return (ret);
1146}
1147
1148static int
1149hv_send_resources_released(struct hv_pcibus *hbus)
1150{
1151	struct pci_child_message pkt;
1152	struct hv_pci_dev *hpdev;
1153	uint32_t wslot;
1154	int ret;
1155
1156	for (wslot = 0; wslot < 256; wslot++) {
1157		hpdev = get_pcichild_wslot(hbus, wslot);
1158		if (!hpdev)
1159			continue;
1160
1161		pkt.message_type.type = PCI_RESOURCES_RELEASED;
1162		pkt.wslot.val = hpdev->desc.wslot.val;
1163
1164		ret = vmbus_chan_send(hbus->sc->chan,
1165		    VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0);
1166		if (ret)
1167			return (ret);
1168	}
1169
1170	return (0);
1171}
1172
1173#define hv_cfg_read(x, s)						\
1174static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus,	\
1175    bus_size_t offset)							\
1176{									\
1177	return (bus_read_##s(bus->cfg_res, offset));			\
1178}
1179
1180#define hv_cfg_write(x, s)						\
1181static inline void hv_cfg_write_##s(struct hv_pcibus *bus,		\
1182    bus_size_t offset, uint##x##_t val)					\
1183{									\
1184	return (bus_write_##s(bus->cfg_res, offset, val));		\
1185}
1186
1187hv_cfg_read(8, 1)
1188hv_cfg_read(16, 2)
1189hv_cfg_read(32, 4)
1190
1191hv_cfg_write(8, 1)
1192hv_cfg_write(16, 2)
1193hv_cfg_write(32, 4)
1194
1195static void
1196_hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size,
1197    uint32_t *val)
1198{
1199	struct hv_pcibus *hbus = hpdev->hbus;
1200	bus_size_t addr = CFG_PAGE_OFFSET + where;
1201
1202	/*
1203	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
1204	 */
1205	if (where + size <= PCIR_COMMAND) {
1206		memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size);
1207	} else if (where >= PCIR_REVID && where + size <=
1208		   PCIR_CACHELNSZ) {
1209		memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where -
1210		       PCIR_REVID, size);
1211	} else if (where >= PCIR_SUBVEND_0 && where + size <=
1212		   PCIR_BIOS) {
1213		memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where -
1214		       PCIR_SUBVEND_0, size);
1215	} else if (where >= PCIR_BIOS && where + size <=
1216		   PCIR_CAP_PTR) {
1217		/* ROM BARs are unimplemented */
1218		*val = 0;
1219	} else if ((where >= PCIR_INTLINE && where + size <=
1220		   PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) {
1221		/*
1222		 * Interrupt Line and Interrupt PIN are hard-wired to zero
1223		 * because this front-end only supports message-signaled
1224		 * interrupts.
1225		 */
1226		*val = 0;
1227	} else if (where + size <= CFG_PAGE_SIZE) {
1228		mtx_lock(&hbus->config_lock);
1229
1230		/* Choose the function to be read. */
1231		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1232
1233		/* Make sure the function was chosen before we start reading.*/
1234		mb();
1235
1236		/* Read from that function's config space. */
1237		switch (size) {
1238		case 1:
1239			*((uint8_t *)val) = hv_cfg_read_1(hbus, addr);
1240			break;
1241		case 2:
1242			*((uint16_t *)val) = hv_cfg_read_2(hbus, addr);
1243			break;
1244		default:
1245			*((uint32_t *)val) = hv_cfg_read_4(hbus, addr);
1246			break;
1247		}
1248		/*
1249		 * Make sure the write was done before we release the lock,
1250		 * allowing consecutive reads/writes.
1251		 */
1252		mb();
1253
1254		mtx_unlock(&hbus->config_lock);
1255	} else {
1256		/* Invalid config read: it's unlikely to reach here. */
1257		memset(val, 0, size);
1258	}
1259}
1260
1261static void
1262_hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size,
1263    uint32_t val)
1264{
1265	struct hv_pcibus *hbus = hpdev->hbus;
1266	bus_size_t addr = CFG_PAGE_OFFSET + where;
1267
1268	/* SSIDs and ROM BARs are read-only */
1269	if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR)
1270		return;
1271
1272	if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) {
1273		mtx_lock(&hbus->config_lock);
1274
1275		/* Choose the function to be written. */
1276		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1277
1278		/* Make sure the function was chosen before we start writing.*/
1279		wmb();
1280
1281		/* Write to that function's config space. */
1282		switch (size) {
1283		case 1:
1284			hv_cfg_write_1(hbus, addr, (uint8_t)val);
1285			break;
1286		case 2:
1287			hv_cfg_write_2(hbus, addr, (uint16_t)val);
1288			break;
1289		default:
1290			hv_cfg_write_4(hbus, addr, (uint32_t)val);
1291			break;
1292		}
1293
1294		/*
1295		 * Make sure the write was done before we release the lock,
1296		 * allowing consecutive reads/writes.
1297		 */
1298		mb();
1299
1300		mtx_unlock(&hbus->config_lock);
1301	} else {
1302		/* Invalid config write: it's unlikely to reach here. */
1303		return;
1304	}
1305}
1306
1307static void
1308vmbus_pcib_set_detaching(void *arg, int pending __unused)
1309{
1310	struct hv_pcibus *hbus = arg;
1311
1312	atomic_set_int(&hbus->detaching, 1);
1313}
1314
1315static void
1316vmbus_pcib_pre_detach(struct hv_pcibus *hbus)
1317{
1318	struct task task;
1319
1320	TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus);
1321
1322	/*
1323	 * Make sure the channel callback won't push any possible new
1324	 * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq.
1325	 */
1326	vmbus_chan_run_task(hbus->sc->chan, &task);
1327
1328	taskqueue_drain_all(hbus->sc->taskq);
1329}
1330
1331
1332/*
1333 * Standard probe entry point.
1334 *
1335 */
1336static int
1337vmbus_pcib_probe(device_t dev)
1338{
1339	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1340	    &g_pass_through_dev_type) == 0) {
1341		device_set_desc(dev, "Hyper-V PCI Express Pass Through");
1342		return (BUS_PROBE_DEFAULT);
1343	}
1344	return (ENXIO);
1345}
1346
1347/*
1348 * Standard attach entry point.
1349 *
1350 */
1351static int
1352vmbus_pcib_attach(device_t dev)
1353{
1354	const int pci_ring_size = (4 * PAGE_SIZE);
1355	const struct hyperv_guid *inst_guid;
1356	struct vmbus_channel *channel;
1357	struct vmbus_pcib_softc *sc;
1358	struct hv_pcibus *hbus;
1359	int rid = 0;
1360	int ret;
1361
1362	hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO);
1363	hbus->pcib = dev;
1364
1365	channel = vmbus_get_channel(dev);
1366	inst_guid = vmbus_chan_guid_inst(channel);
1367	hbus->pci_domain = inst_guid->hv_guid[9] |
1368			  (inst_guid->hv_guid[8] << 8);
1369
1370	mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF);
1371	mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF);
1372	TAILQ_INIT(&hbus->children);
1373	TAILQ_INIT(&hbus->dr_list);
1374
1375	hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
1376	    0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH,
1377	    RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));
1378
1379	if (!hbus->cfg_res) {
1380		device_printf(dev, "failed to get resource for cfg window\n");
1381		ret = ENXIO;
1382		goto free_bus;
1383	}
1384
1385	sc = device_get_softc(dev);
1386	sc->chan = channel;
1387	sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
1388	sc->hbus = hbus;
1389
1390	/*
1391	 * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT
1392	 * messages. NB: we can't handle the messages in the channel callback
1393	 * directly, because the message handlers need to send new messages
1394	 * to the host and waits for the host's completion messages, which
1395	 * must also be handled by the channel callback.
1396	 */
1397	sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK,
1398	    taskqueue_thread_enqueue, &sc->taskq);
1399	taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq");
1400
1401	hbus->sc = sc;
1402
1403	init_completion(&hbus->query_completion);
1404	hbus->query_comp = &hbus->query_completion;
1405
1406	ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size,
1407		NULL, 0, vmbus_pcib_on_channel_callback, sc);
1408	if (ret)
1409		goto free_res;
1410
1411	ret = hv_pci_protocol_negotiation(hbus);
1412	if (ret)
1413		goto vmbus_close;
1414
1415	ret = hv_pci_query_relations(hbus);
1416	if (ret)
1417		goto vmbus_close;
1418	wait_for_completion(hbus->query_comp);
1419
1420	ret = hv_pci_enter_d0(hbus);
1421	if (ret)
1422		goto vmbus_close;
1423
1424	ret = hv_send_resources_allocated(hbus);
1425	if (ret)
1426		goto vmbus_close;
1427
1428	hbus->pci_bus = device_add_child(dev, "pci", -1);
1429	if (!hbus->pci_bus) {
1430		device_printf(dev, "failed to create pci bus\n");
1431		ret = ENXIO;
1432		goto vmbus_close;
1433	}
1434
1435	bus_generic_attach(dev);
1436
1437	hbus->state = hv_pcibus_installed;
1438
1439	return (0);
1440
1441vmbus_close:
1442	vmbus_pcib_pre_detach(hbus);
1443	vmbus_chan_close(sc->chan);
1444free_res:
1445	taskqueue_free(sc->taskq);
1446	free_completion(&hbus->query_completion);
1447	free(sc->rx_buf, M_DEVBUF);
1448	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
1449free_bus:
1450	mtx_destroy(&hbus->device_list_lock);
1451	mtx_destroy(&hbus->config_lock);
1452	free(hbus, M_DEVBUF);
1453	return (ret);
1454}
1455
1456/*
1457 * Standard detach entry point
1458 */
1459static int
1460vmbus_pcib_detach(device_t dev)
1461{
1462	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1463	struct hv_pcibus *hbus = sc->hbus;
1464	struct pci_message teardown_packet;
1465	struct pci_bus_relations relations;
1466	int ret;
1467
1468	vmbus_pcib_pre_detach(hbus);
1469
1470	if (hbus->state == hv_pcibus_installed)
1471		bus_generic_detach(dev);
1472
1473	/* Delete any children which might still exist. */
1474	memset(&relations, 0, sizeof(relations));
1475	hv_pci_devices_present(hbus, &relations);
1476
1477	ret = hv_send_resources_released(hbus);
1478	if (ret)
1479		device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n");
1480
1481	teardown_packet.type = PCI_BUS_D0EXIT;
1482	ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
1483	    &teardown_packet, sizeof(struct pci_message), 0);
1484	if (ret)
1485		device_printf(dev, "failed to send PCI_BUS_D0EXIT\n");
1486
1487	taskqueue_drain_all(hbus->sc->taskq);
1488	vmbus_chan_close(sc->chan);
1489	taskqueue_free(sc->taskq);
1490
1491	free_completion(&hbus->query_completion);
1492	free(sc->rx_buf, M_DEVBUF);
1493	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
1494
1495	mtx_destroy(&hbus->device_list_lock);
1496	mtx_destroy(&hbus->config_lock);
1497	free(hbus, M_DEVBUF);
1498
1499	return (0);
1500}
1501
1502static int
1503vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val)
1504{
1505	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1506
1507	switch (which) {
1508	case PCIB_IVAR_DOMAIN:
1509		*val = sc->hbus->pci_domain;
1510		return (0);
1511
1512	case PCIB_IVAR_BUS:
1513		/* There is only bus 0. */
1514		*val = 0;
1515		return (0);
1516	}
1517	return (ENOENT);
1518}
1519
/*
 * Bus write_ivar method.  No instance variable is writable: every request
 * is rejected with ENOENT and the arguments are ignored.
 */
static int
vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val)
{
	return (ENOENT);
}
1525
1526static struct resource *
1527vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
1528	rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
1529{
1530	unsigned int bar_no;
1531	struct hv_pci_dev *hpdev;
1532	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1533	struct resource *res;
1534	unsigned int devfn;
1535
1536	if (type == PCI_RES_BUS)
1537		return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid,
1538		    start, end, count, flags));
1539
1540	/* Devices with port I/O BAR are not supported. */
1541	if (type == SYS_RES_IOPORT)
1542		return (NULL);
1543
1544	if (type == SYS_RES_MEMORY) {
1545		devfn = PCI_DEVFN(pci_get_slot(child),
1546		    pci_get_function(child));
1547		hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1548		if (!hpdev)
1549			return (NULL);
1550
1551		bar_no = PCI_RID2BAR(*rid);
1552		if (bar_no >= MAX_NUM_BARS)
1553			return (NULL);
1554
1555		/* Make sure a 32-bit BAR gets a 32-bit address */
1556		if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64))
1557			end = ulmin(end, 0xFFFFFFFF);
1558	}
1559
1560	res = bus_generic_alloc_resource(dev, child, type, rid,
1561		start, end, count, flags);
1562	/*
1563	 * If this is a request for a specific range, assume it is
1564	 * correct and pass it up to the parent.
1565	 */
1566	if (res == NULL && start + count - 1 == end)
1567		res = bus_generic_alloc_resource(dev, child, type, rid,
1568		    start, end, count, flags);
1569	return (res);
1570}
1571
1572static int
1573vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid,
1574    struct resource *r)
1575{
1576	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1577
1578	if (type == PCI_RES_BUS)
1579		return (pci_domain_release_bus(sc->hbus->pci_domain, child,
1580		    rid, r));
1581
1582	if (type == SYS_RES_IOPORT)
1583		return (EINVAL);
1584
1585	return (bus_generic_release_resource(dev, child, type, rid, r));
1586}
1587
#if __FreeBSD_version >= 1100000
/*
 * Bus get_cpus method: answers on behalf of a child by asking
 * bus_get_cpus() about the bridge device itself.
 */
static int
vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
    size_t setsize, cpuset_t *cpuset)
{
	int error;

	error = bus_get_cpus(pcib, op, setsize, cpuset);
	return (error);
}
#endif
1596
1597static uint32_t
1598vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func,
1599    u_int reg, int bytes)
1600{
1601	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1602	struct hv_pci_dev *hpdev;
1603	unsigned int devfn = PCI_DEVFN(slot, func);
1604	uint32_t data = 0;
1605
1606	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1607
1608	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1609	if (!hpdev)
1610		return (~0);
1611
1612	_hv_pcifront_read_config(hpdev, reg, bytes, &data);
1613
1614	return (data);
1615}
1616
1617static void
1618vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func,
1619    u_int reg, uint32_t data, int bytes)
1620{
1621	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1622	struct hv_pci_dev *hpdev;
1623	unsigned int devfn = PCI_DEVFN(slot, func);
1624
1625	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1626
1627	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1628	if (!hpdev)
1629		return;
1630
1631	_hv_pcifront_write_config(hpdev, reg, bytes, data);
1632}
1633
/*
 * pcib route_interrupt method.  Always answers PCI_INVALID_IRQ, whatever
 * the device and pin.
 */
static int
vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin)
{
	/* We only support MSI/MSI-X and don't support INTx interrupt. */
	return (PCI_INVALID_IRQ);
}
1640
1641static int
1642vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count,
1643    int maxcount, int *irqs)
1644{
1645	return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount,
1646	    irqs));
1647}
1648
1649static int
1650vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
1651{
1652	return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs));
1653}
1654
1655static int
1656vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
1657{
1658	return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq));
1659}
1660
1661static int
1662vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq)
1663{
1664	return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq));
1665}
1666
/*
 * Fields of the MSI address/data pair as used by vmbus_pcib_map_msi():
 * MSI_INTEL_ADDR_DEST masks the address bits that, shifted right by 12,
 * give the target CPU; MSI_INTEL_DATA_INTVEC masks the vector out of the
 * data word; MSI_INTEL_DATA_DELFIXED is passed as the delivery mode.
 */
#define	MSI_INTEL_ADDR_DEST	0x000ff000
#define	MSI_INTEL_DATA_INTVEC	IOART_INTVEC	/* Interrupt vector. */
#define	MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
1670
1671static int
1672vmbus_pcib_map_msi(device_t pcib, device_t child, int irq,
1673    uint64_t *addr, uint32_t *data)
1674{
1675	unsigned int devfn;
1676	struct hv_pci_dev *hpdev;
1677
1678	uint64_t v_addr;
1679	uint32_t v_data;
1680	struct hv_irq_desc *hid, *tmp_hid;
1681	unsigned int cpu, vcpu_id;
1682	unsigned int vector;
1683
1684	struct vmbus_pcib_softc *sc = device_get_softc(pcib);
1685	struct pci_create_interrupt *int_pkt;
1686	struct compose_comp_ctxt comp;
1687	struct {
1688		struct pci_packet pkt;
1689		uint8_t buffer[sizeof(struct pci_create_interrupt)];
1690	} ctxt;
1691
1692	int ret;
1693
1694	devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child));
1695	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1696	if (!hpdev)
1697		return (ENOENT);
1698
1699	ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq,
1700	    &v_addr, &v_data);
1701	if (ret)
1702		return (ret);
1703
1704	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) {
1705		if (hid->irq == irq) {
1706			TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link);
1707			hv_int_desc_free(hpdev, hid);
1708			break;
1709		}
1710	}
1711
1712	cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12;
1713	vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu);
1714	vector = v_data & MSI_INTEL_DATA_INTVEC;
1715
1716	init_completion(&comp.comp_pkt.host_event);
1717
1718	memset(&ctxt, 0, sizeof(ctxt));
1719	ctxt.pkt.completion_func = hv_pci_compose_compl;
1720	ctxt.pkt.compl_ctxt = &comp;
1721
1722	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
1723	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1724	int_pkt->wslot.val = hpdev->desc.wslot.val;
1725	int_pkt->int_desc.vector = vector;
1726	int_pkt->int_desc.vector_count = 1;
1727	int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED;
1728	int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id;
1729
1730	ret = vmbus_chan_send(sc->chan,	VMBUS_CHANPKT_TYPE_INBAND,
1731	    VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt),
1732	    (uint64_t)(uintptr_t)&ctxt.pkt);
1733	if (ret) {
1734		free_completion(&comp.comp_pkt.host_event);
1735		return (ret);
1736	}
1737
1738	wait_for_completion(&comp.comp_pkt.host_event);
1739	free_completion(&comp.comp_pkt.host_event);
1740
1741	if (comp.comp_pkt.completion_status < 0)
1742		return (EPROTO);
1743
1744	*addr = comp.int_desc.address;
1745	*data = comp.int_desc.data;
1746
1747	hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO);
1748	hid->irq = irq;
1749	hid->desc = comp.int_desc;
1750	TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link);
1751
1752	return (0);
1753}
1754
/*
 * Method table of the bridge driver.  Probe/attach/detach, ivar access,
 * resource allocation/release, config space access and the MSI/MSI-X
 * methods are implemented in this file; the remaining device and bus
 * methods use the bus_generic_* implementations, and pcib_maxslots is the
 * function of the same name.
 */
static device_method_t vmbus_pcib_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,         vmbus_pcib_probe),
	DEVMETHOD(device_attach,        vmbus_pcib_attach),
	DEVMETHOD(device_detach,        vmbus_pcib_detach),
	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
	DEVMETHOD(device_suspend,	bus_generic_suspend),
	DEVMETHOD(device_resume,	bus_generic_resume),

	/* Bus interface */
	DEVMETHOD(bus_read_ivar,		vmbus_pcib_read_ivar),
	DEVMETHOD(bus_write_ivar,		vmbus_pcib_write_ivar),
	DEVMETHOD(bus_alloc_resource,		vmbus_pcib_alloc_resource),
	DEVMETHOD(bus_release_resource,		vmbus_pcib_release_resource),
	DEVMETHOD(bus_activate_resource,   bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
	DEVMETHOD(bus_setup_intr,	   bus_generic_setup_intr),
	DEVMETHOD(bus_teardown_intr,	   bus_generic_teardown_intr),
	/* bus_get_cpus is only registered on FreeBSD 11.0 and later. */
#if __FreeBSD_version >= 1100000
	DEVMETHOD(bus_get_cpus,			vmbus_pcib_get_cpus),
#endif

	/* pcib interface */
	DEVMETHOD(pcib_maxslots,		pcib_maxslots),
	DEVMETHOD(pcib_read_config,		vmbus_pcib_read_config),
	DEVMETHOD(pcib_write_config,		vmbus_pcib_write_config),
	DEVMETHOD(pcib_route_interrupt,		vmbus_pcib_route_intr),
	DEVMETHOD(pcib_alloc_msi,		vmbus_pcib_alloc_msi),
	DEVMETHOD(pcib_release_msi,		vmbus_pcib_release_msi),
	DEVMETHOD(pcib_alloc_msix,		vmbus_pcib_alloc_msix),
	DEVMETHOD(pcib_release_msix,		vmbus_pcib_release_msix),
	DEVMETHOD(pcib_map_msi,			vmbus_pcib_map_msi),

	DEVMETHOD_END
};
1790
/*
 * Driver registration: a "pcib" class driver using the method table above
 * and struct vmbus_pcib_softc as its softc, registered on the "vmbus" bus
 * as module vmbus_pcib, with module dependencies on vmbus and pci.
 */
static devclass_t pcib_devclass;

DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods,
		sizeof(struct vmbus_pcib_softc));
DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0);
MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1);
MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1);
1798
1799#endif /* NEW_PCIB */
1800