// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/interval_tree.h>
#include <linux/vfio.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

#include "vfio_dev.h"
#include "cmds.h"
#include "dirty.h"

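/*
 * Direction selectors for pds_vfio_dirty_seq_ack(): READ_SEQ pulls the
 * device's "sequence" dirty bitmap into host memory, WRITE_ACK pushes the
 * host's "ack" bitmap back to the device.
 */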
#define READ_SEQ true
#define WRITE_ACK false

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	return pds_vfio->dirty.is_enabled;
}

void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = true;
}

void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = false;
}

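/*
 * Debug helper: read back the dirty region layout the device is using and
 * log each region's base, page count, and page size.
 */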
static void
pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
				 u8 max_regions)
{
	int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_dirty_region_info *region_info;
	dma_addr_t regions_dma;
	u8 num_regions;
	int err;

	region_info = kcalloc(max_regions,
			      sizeof(struct pds_lm_dirty_region_info),
			      GFP_KERNEL);
	if (!region_info)
		return;

	regions_dma =
		dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(pdsc_dev, regions_dma))
		goto out_free_region_info;

	err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
					&num_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
	if (err)
		goto out_free_region_info;

	for (unsigned int i = 0; i < num_regions; i++)
		dev_dbg(&pdev->dev,
			"region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
			i, le64_to_cpu(region_info[i].dma_base),
			le32_to_cpu(region_info[i].page_count),
			region_info[i].page_size_log2);

out_free_region_info:
	kfree(region_info);
}

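/*
 * Each region keeps two host bitmaps with one bit per tracked page: host_seq
 * holds the most recent dirty state read from the device, host_ack holds the
 * state last acknowledged back to it.
 */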
static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_region *region,
					unsigned long bytes)
{
	unsigned long *host_seq_bmp, *host_ack_bmp;

	host_seq_bmp = vzalloc(bytes);
	if (!host_seq_bmp)
		return -ENOMEM;

	host_ack_bmp = vzalloc(bytes);
	if (!host_ack_bmp) {
		vfree(host_seq_bmp);
		return -ENOMEM;
	}

	region->host_seq = host_seq_bmp;
	region->host_ack = host_ack_bmp;
	region->bmp_bytes = bytes;

	return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
	if (!dirty->regions)
		return;

	for (int i = 0; i < dirty->num_regions; i++) {
		struct pds_vfio_region *region = &dirty->regions[i];

		vfree(region->host_seq);
		vfree(region->host_ack);
		region->host_seq = NULL;
		region->host_ack = NULL;
		region->bmp_bytes = 0;
	}
}

static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_region *region)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;

	dma_unmap_single(pdsc_dev, region->sgl_addr,
			 region->num_sge * sizeof(struct pds_lm_sg_elem),
			 DMA_BIDIRECTIONAL);
	kfree(region->sgl);

	region->num_sge = 0;
	region->sgl = NULL;
	region->sgl_addr = 0;
}

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;

	if (!dirty->regions)
		return;

	for (int i = 0; i < dirty->num_regions; i++) {
		struct pds_vfio_region *region = &dirty->regions[i];

		if (region->sgl)
			__pds_vfio_dirty_free_sgl(pds_vfio, region);
	}
}

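/*
 * Allocate and DMA-map the scatter-gather list used to transfer a region's
 * bitmap. One SG element describes at most one page of bitmap, and each
 * bitmap page covers PAGE_SIZE * 8 tracked pages, which bounds max_sge.
 */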
static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				    struct pds_vfio_region *region,
				    u32 page_count)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_sg_elem *sgl;
	dma_addr_t sgl_addr;
	size_t sgl_size;
	u32 max_sge;

	max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
	sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);

	sgl = kzalloc(sgl_size, GFP_KERNEL);
	if (!sgl)
		return -ENOMEM;

	sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, sgl_addr)) {
		kfree(sgl);
		return -EIO;
	}

	region->sgl = sgl;
	region->num_sge = max_sge;
	region->sgl_addr = sgl_addr;

	return 0;
}

static void pds_vfio_dirty_free_regions(struct pds_vfio_dirty *dirty)
{
	vfree(dirty->regions);
	dirty->regions = NULL;
	dirty->num_regions = 0;
}

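/*
 * Build the driver's per-region tracking state from the region_info
 * negotiated with the device. dev_bmp_offset_start_byte records where each
 * region's slice starts within the device's single logical dirty bitmap.
 */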
static int pds_vfio_dirty_alloc_regions(struct pds_vfio_pci_device *pds_vfio,
					struct pds_lm_dirty_region_info *region_info,
					u64 region_page_size, u8 num_regions)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u32 dev_bmp_offset_byte = 0;
	int err;

	dirty->regions = vcalloc(num_regions, sizeof(struct pds_vfio_region));
	if (!dirty->regions)
		return -ENOMEM;
	dirty->num_regions = num_regions;

	for (int i = 0; i < num_regions; i++) {
		struct pds_lm_dirty_region_info *ri = &region_info[i];
		struct pds_vfio_region *region = &dirty->regions[i];
		u64 region_size, region_start;
		u32 page_count;

		/* page_count might be adjusted by the device */
		page_count = le32_to_cpu(ri->page_count);
		region_start = le64_to_cpu(ri->dma_base);
		region_size = page_count * region_page_size;

		err = pds_vfio_dirty_alloc_bitmaps(region,
						   page_count / BITS_PER_BYTE);
		if (err) {
			dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
				ERR_PTR(err));
			goto out_free_regions;
		}

		err = pds_vfio_dirty_alloc_sgl(pds_vfio, region, page_count);
		if (err) {
			dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
				ERR_PTR(err));
			goto out_free_regions;
		}

		region->size = region_size;
		region->start = region_start;
		region->page_size = region_page_size;
		region->dev_bmp_offset_start_byte = dev_bmp_offset_byte;

		dev_bmp_offset_byte += page_count / BITS_PER_BYTE;
		if (dev_bmp_offset_byte % BITS_PER_BYTE) {
			dev_err(&pdev->dev, "Device bitmap offset is mis-aligned\n");
			err = -EINVAL;
			goto out_free_regions;
		}
	}

	return 0;

out_free_regions:
	pds_vfio_dirty_free_bitmaps(dirty);
	pds_vfio_dirty_free_sgl(pds_vfio);
	pds_vfio_dirty_free_regions(dirty);

	return err;
}

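/*
 * Enable device dirty page tracking: verify tracking is currently off,
 * collapse the requested IOVA ranges down to the number of regions the
 * device supports, describe each region to the device, then allocate the
 * host bitmaps and SG lists needed to service sync requests.
 */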
static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
				 struct rb_root_cached *ranges, u32 nnodes,
				 u64 *page_size)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_dirty_region_info *region_info;
	struct interval_tree_node *node = NULL;
	u64 region_page_size = *page_size;
	u8 max_regions = 0, num_regions;
	dma_addr_t regions_dma = 0;
	u32 num_ranges = nnodes;
	int err;
	u16 len;

	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
		pds_vfio->vf_id);

	if (pds_vfio_dirty_is_enabled(pds_vfio))
		return -EINVAL;

	/* find if dirty tracking is disabled, i.e. num_regions == 0 */
	err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
					&num_regions);
	if (err < 0) {
		dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
			ERR_PTR(err));
		return err;
	} else if (num_regions) {
		dev_err(&pdev->dev,
			"Dirty tracking already enabled for %d regions\n",
			num_regions);
		return -EEXIST;
	} else if (!max_regions) {
		dev_err(&pdev->dev,
			"Device doesn't support dirty tracking, max_regions %d\n",
			max_regions);
		return -EOPNOTSUPP;
	}

	if (num_ranges > max_regions) {
		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
		num_ranges = max_regions;
	}

	region_info = kcalloc(num_ranges, sizeof(*region_info), GFP_KERNEL);
	if (!region_info)
		return -ENOMEM;
	len = num_ranges * sizeof(*region_info);

	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	if (!node) {
		err = -EINVAL;
		goto out_free_region_info;
	}
	for (int i = 0; i < num_ranges; i++) {
		struct pds_lm_dirty_region_info *ri = &region_info[i];
		u64 region_size = node->last - node->start + 1;
		u64 region_start = node->start;
		u32 page_count;

		page_count = DIV_ROUND_UP(region_size, region_page_size);

		ri->dma_base = cpu_to_le64(region_start);
		ri->page_count = cpu_to_le32(page_count);
		ri->page_size_log2 = ilog2(region_page_size);

		dev_dbg(&pdev->dev,
			"region_info[%d]: region_start 0x%llx region_end 0x%lx region_size 0x%llx page_count %u page_size %llu\n",
			i, region_start, node->last, region_size, page_count,
			region_page_size);

		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
				     DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, regions_dma)) {
		err = -ENOMEM;
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, num_ranges);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
	if (err)
		goto out_free_region_info;

	err = pds_vfio_dirty_alloc_regions(pds_vfio, region_info,
					   region_page_size, num_ranges);
	if (err) {
		dev_err(&pdev->dev,
			"Failed to allocate %d regions for tracking dirty regions: %pe\n",
			num_ranges, ERR_PTR(err));
		goto out_dirty_disable;
	}

	pds_vfio_dirty_set_enabled(pds_vfio);

	pds_vfio_print_guest_region_info(pds_vfio, max_regions);

	kfree(region_info);

	return 0;

out_dirty_disable:
	pds_vfio_dirty_disable_cmd(pds_vfio);
out_free_region_info:
	kfree(region_info);
	return err;
}

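/*
 * Tear down dirty tracking state. send_cmd selects whether the device is
 * also told to stop tracking and that live migration is no longer in
 * progress; it can be skipped, e.g. when the device state has already been
 * torn down elsewhere and the commands would be unnecessary.
 */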
void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
{
	if (pds_vfio_dirty_is_enabled(pds_vfio)) {
		pds_vfio_dirty_set_disabled(pds_vfio);
		if (send_cmd)
			pds_vfio_dirty_disable_cmd(pds_vfio);
		pds_vfio_dirty_free_sgl(pds_vfio);
		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
		pds_vfio_dirty_free_regions(&pds_vfio->dirty);
	}

	if (send_cmd)
		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
}

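/*
 * Transfer one window of a region's bitmap between host and device: look up
 * the vmalloc'd bitmap pages behind [offset, offset + bmp_bytes) with
 * vmalloc_to_page(), build an sg_table over them, copy the DMA addresses
 * into the region's pre-mapped SG list, and issue the seq/ack command with
 * the device-relative bitmap offset.
 */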
static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
				  struct pds_vfio_region *region,
				  unsigned long *seq_ack_bmp, u32 offset,
				  u32 bmp_bytes, bool read_seq)
{
	const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
	u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	unsigned long long npages;
	struct sg_table sg_table;
	struct scatterlist *sg;
	struct page **pages;
	u32 page_offset;
	const void *bmp;
	size_t size;
	u16 num_sge;
	int err;
	int i;

	bmp = (void *)((u64)seq_ack_bmp + offset);
	page_offset = offset_in_page(bmp);
	bmp -= page_offset;

	/*
	 * Start and end of bitmap section to seq/ack might not be page
	 * aligned, so use the page_offset to account for that so there
	 * will be enough pages to represent the bmp_bytes
	 */
	npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	for (unsigned long long i = 0; i < npages; i++) {
		struct page *page = vmalloc_to_page(bmp);

		if (!page) {
			err = -EFAULT;
			goto out_free_pages;
		}

		pages[i] = page;
		bmp += PAGE_SIZE;
	}

	err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
					bmp_bytes, GFP_KERNEL);
	if (err)
		goto out_free_pages;

	err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
	if (err)
		goto out_free_sg_table;

	for_each_sgtable_dma_sg(&sg_table, sg, i) {
		struct pds_lm_sg_elem *sg_elem = &region->sgl[i];

		sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
		sg_elem->len = cpu_to_le32(sg_dma_len(sg));
	}

	num_sge = sg_table.nents;
	size = num_sge * sizeof(struct pds_lm_sg_elem);
	offset += region->dev_bmp_offset_start_byte;
	dma_sync_single_for_device(pdsc_dev, region->sgl_addr, size, dma_dir);
	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, region->sgl_addr, num_sge,
					 offset, bmp_bytes, read_seq);
	if (err)
		dev_err(&pdev->dev,
			"Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
			bmp_type_str, offset, bmp_bytes,
			num_sge, region->sgl_addr, ERR_PTR(err));
	dma_sync_single_for_cpu(pdsc_dev, region->sgl_addr, size, dma_dir);

	dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
out_free_sg_table:
	sg_free_table(&sg_table);
out_free_pages:
	kfree(pages);

	return err;
}

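/* Thin wrappers that select the host bitmap and transfer direction. */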
static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
				    struct pds_vfio_region *region,
				    u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, region, region->host_ack,
				      offset, len, WRITE_ACK);
}

static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
				   struct pds_vfio_region *region,
				   u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, region, region->host_seq,
				      offset, len, READ_SEQ);
}

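/*
 * A page is dirty when its seq bit differs from its ack bit. Report each
 * such page through iova_bitmap_set() and copy seq into ack so that the
 * following write_ack pass tells the device which dirty state the host has
 * consumed.
 */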
static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
					  struct pds_vfio_region *region,
					  struct iova_bitmap *dirty_bitmap,
					  u32 bmp_offset, u32 len_bytes)
{
	u64 page_size = region->page_size;
	u64 region_start = region->start;
	u32 bmp_offset_bit;
	__le64 *seq, *ack;
	int dword_count;

	dword_count = len_bytes / sizeof(u64);
	seq = (__le64 *)((u64)region->host_seq + bmp_offset);
	ack = (__le64 *)((u64)region->host_ack + bmp_offset);
	bmp_offset_bit = bmp_offset * 8;

	for (int i = 0; i < dword_count; i++) {
		u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);

		/* prepare for next write_ack call */
		ack[i] = seq[i];

		for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
			if (xor & BIT(bit_i)) {
				u64 abs_bit_i = bmp_offset_bit +
						i * BITS_PER_TYPE(u64) + bit_i;
				u64 addr = abs_bit_i * page_size + region_start;

				iova_bitmap_set(dirty_bitmap, addr, page_size);
			}
		}
	}

	return 0;
}

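/* Find the tracking region that contains the given iova, if any. */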
static struct pds_vfio_region *
pds_vfio_get_region(struct pds_vfio_pci_device *pds_vfio, unsigned long iova)
{
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;

	for (int i = 0; i < dirty->num_regions; i++) {
		struct pds_vfio_region *region = &dirty->regions[i];

		if (iova >= region->start &&
		    iova < (region->start + region->size))
			return region;
	}

	return NULL;
}

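/*
 * Service one dirty bitmap request for [iova, iova + length): validate the
 * range against its region, convert it into a 64-bit aligned byte window of
 * the bitmap, read the device's seq state, report dirtied pages, then
 * acknowledge the consumed state back to the device.
 */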
static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
			       struct iova_bitmap *dirty_bitmap,
			       unsigned long iova, unsigned long length)
{
	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
	struct pds_vfio_region *region;
	u64 bmp_offset, bmp_bytes;
	u64 bitmap_size, pages;
	int err;

	dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);

	if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
		dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
			pds_vfio->vf_id);
		return -EINVAL;
	}

	region = pds_vfio_get_region(pds_vfio, iova);
	if (!region) {
		dev_err(dev, "vf%u: Failed to find region that contains iova 0x%lx length 0x%lx\n",
			pds_vfio->vf_id, iova, length);
		return -EINVAL;
	}

	pages = DIV_ROUND_UP(length, region->page_size);
	bitmap_size =
		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;

	dev_dbg(dev,
		"vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
		pds_vfio->vf_id, iova, length, region->page_size,
		pages, bitmap_size);

	if (!length || ((iova - region->start + length) > region->size)) {
		dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
			iova, length);
		return -EINVAL;
	}

	/* bitmap is modified in 64 bit chunks */
	bmp_bytes = ALIGN(DIV_ROUND_UP(length / region->page_size,
				       sizeof(u64)), sizeof(u64));
	if (bmp_bytes != bitmap_size) {
		dev_err(dev,
			"Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
			bmp_bytes, bitmap_size);
		return -EINVAL;
	}

	if (bmp_bytes > region->bmp_bytes) {
		dev_err(dev,
			"Calculated bitmap bytes %llu larger than region's cached bmp_bytes %llu\n",
			bmp_bytes, region->bmp_bytes);
		return -EINVAL;
	}

	bmp_offset = DIV_ROUND_UP((iova - region->start) /
				  region->page_size, sizeof(u64));

	dev_dbg(dev,
		"Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
		iova, length, bmp_offset, bmp_bytes);

	err = pds_vfio_dirty_read_seq(pds_vfio, region, bmp_offset, bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_process_bitmaps(pds_vfio, region, dirty_bitmap,
					     bmp_offset, bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_write_ack(pds_vfio, region, bmp_offset, bmp_bytes);
	if (err)
		return err;

	return 0;
}

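/*
 * VFIO DMA logging entry points below. These are expected to be wired up as
 * the driver's vfio_log_ops and serialize with migration state changes via
 * state_mutex.
 */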
int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
				unsigned long length, struct iova_bitmap *dirty)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
	mutex_unlock(&pds_vfio->state_mutex);

	return err;
}

int pds_vfio_dma_logging_start(struct vfio_device *vdev,
			       struct rb_root_cached *ranges, u32 nnodes,
			       u64 *page_size)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
	err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
	mutex_unlock(&pds_vfio->state_mutex);

	return err;
}

int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_dirty_disable(pds_vfio, true);
	mutex_unlock(&pds_vfio->state_mutex);

	return 0;
}