
#------------------------------------------------------------------------------
# $File: bioinformatics,v 1.5 2019/04/19 00:42:27 christos Exp $
# bioinformatics:  file(1) magic for bioinformatics file formats

###############################################################################
# BGZF (Blocked GNU Zip Format) - gzip compatible, but also indexable.
# Used by SAMtools bgzip/tabix (http://samtools.sourceforge.net/tabix.shtml)
#
# The chain below narrows a gzip stream down to BGZF step by step; only the
# last two tests print anything.
###############################################################################
# Two-byte gzip signature.
0	string	\037\213
# Bit 0x04 of the byte at offset 3 must be set.
>3	byte	&0x04
# The two bytes at offset 12 must read "BC".
>>12	string	BC
# Little-endian short at offset 14 must have bit 0x02 set.
>>>14	leshort	&0x02	Blocked GNU Zip Format (BGZF; gzip compatible)
# Little-endian short at offset 16 is reported as the block length.
>>>>16	leshort	x	\b, block length %d
!:mime	application/x-gzip

###############################################################################
# Tabix index file
# Used by SAMtools bgzip/tabix (http://samtools.sourceforge.net/tabix.shtml)
#
# All fields after the 4-byte signature are read as 32-bit little-endian
# integers, except the comment character which is read as a single byte.
###############################################################################
0	string	TBI\1	SAMtools TBI (Tabix index format)
# Offset 4: number of reference sequences (singular / plural wording).
>4	lelong	=1	\b, with %d reference sequence
>4	lelong	>1	\b, with %d reference sequences
# Offset 8: format word.  Bit 0x10000 set selects the BED-style wording ...
>8	lelong	&0x10000	\b, using half-closed-half-open coordinates (BED style)
# ... and with that bit clear the exact value 0/1/2 picks the description.
>8	lelong	^0x10000
>>8	lelong	=0	\b, using closed and one based coordinates (GFF style)
>>8	lelong	=1	\b, using SAM format
>>8	lelong	=2	\b, using VCF format
# Offsets 12 and 16: column numbers.
>12	lelong	x	\b, sequence name column: %d
>16	lelong	x	\b, region start column: %d
# The region end column (offset 20) is only printed when the format word is 0.
>8	lelong	=0
>>20	lelong	x	\b, region end column: %d
# Offset 24: comment character; offset 28: number of lines to skip.
>24	byte	x	\b, comment character: %c
>28	lelong	x	\b, skip line count: %d

###############################################################################
# BAM (Binary Sequence Alignment/Map format)
# Used by SAMtools (http://samtools.sourceforge.net/SAM1.pdf)
# Data is normally present only within compressed BGZF blocks (CDATA), so use
# file -z to examine it.
###############################################################################
0	string	BAM\1	SAMtools BAM (Binary Sequence Alignment/Map)
# Offset 4: 32-bit little-endian length; everything below requires it to be > 0.
>0x04	lelong	>0
# Text right after the length field: look for an @HD line carrying VN: ...
>>&0x00	regex	=^[@]HD\t.*VN:		\b, with SAM header
# ... and print the version number that follows it.
>>>&0	regex	=[0-9.]+		\b version %s
# Reference-sequence count, located through the length stored at offset 4.
# Singular and plural are separate tests so a count of 1 no longer prints
# "1 reference sequences" (same wording split as the Tabix entry).
>>&(0x04)	lelong	=1	\b, with %d reference sequence
>>&(0x04)	lelong	>1	\b, with %d reference sequences

###############################################################################
# BAI (BAM indexing format)
# Used by SAMtools (http://samtools.sourceforge.net/SAM1.pdf)
###############################################################################
0	string	BAI\1	SAMtools BAI (BAM indexing format)
# Offset 4: 32-bit little-endian reference-sequence count.  Singular and
# plural are separate tests so a count of 1 no longer prints
# "1 reference sequences" (same wording split as the Tabix entry).
>0x04	lelong	=1	\b, with %d reference sequence
>0x04	lelong	>1	\b, with %d reference sequences

###############################################################################
# CRAM (compressed, reference-based sequence alignment format)
###############################################################################
0	string	CRAM	CRAM
# Bytes 4 and 5 are printed as "version <major>.<minor>"; the >-1 test only
# accepts non-negative (signed) byte values.
>4	byte	>-1	version %d.
>5	byte	>-1	\b%d
# A non-empty string starting at offset 6 is printed as the file identifier.
>6	string	>\0	(identified as %s)

###############################################################################
# BCF (Binary Call Format), version 1
# Used by SAMtools & VCFtools (http://vcftools.sourceforge.net/bcf.pdf)
# Data is normally present only within compressed BGZF blocks (CDATA), so use
# file -z to examine it.
#
# Nothing is printed until both the seqnm and smpl lengths have been checked.
###############################################################################
0	string	BCF\4
# Length of seqnm data in bytes must be positive.
>&0x00	lelong	>0
# Length of smpl data in bytes must be positive.
>>&(&-0x04)	lelong	>0	SAMtools BCF (Binary Call Format)
# Length of meta data in bytes must be positive.
>>>&(&-0x04)	lelong	>0
# The meta text must contain a samtoolsVersion line ...
>>>>&0x00	search	##samtoolsVersion=
# ... whose value is printed as the generating version.
>>>>>&0x00	string	x	\b, generated by SAMtools version %s

###############################################################################
# BCF (Binary Call Format), version 2.1
# Used by SAMtools (https://samtools.github.io/hts-specs/BCFv2_qref.pdf)
# Data is normally present only within compressed BGZF blocks (CDATA), so use
# file -z to examine it.
###############################################################################
0	string	BCF\2\1	Binary Call Format (BCF) version 2.1
# Length of the header text (directly after the signature) must be positive.
>&0x00	lelong	>0
# The header text must contain a samtoolsVersion line ...
>>&0x00	search	##samtoolsVersion=
# ... whose value is printed as the generating version.
>>>&0x00	string	x	\b, generated by SAMtools version %s

###############################################################################
# BCF (Binary Call Format), version 2.2
# Used by SAMtools (https://samtools.github.io/hts-specs/BCFv2_qref.pdf)
# Data is normally present only within compressed BGZF blocks (CDATA), so use
# file -z to examine it.
###############################################################################
0	string	BCF\2\2	Binary Call Format (BCF) version 2.2
# Length of the header text (directly after the signature) must be positive.
>&0x00	lelong	>0
# The header text must contain a samtoolsVersion line ...
>>&0x00	search	##samtoolsVersion=
# ... whose value is printed as the generating version.
>>>&0x00	string	x	\b, generated by SAMtools version %s

###############################################################################
# VCF (Variant Call Format)
# Used by VCFtools (http://vcftools.sourceforge.net/)
###############################################################################
# Search for the fileformat meta line; the text right after "VCFv" is printed
# as the version.
0	search	##fileformat=VCFv	Variant Call Format (VCF)
>&0	string	x	\b version %s

###############################################################################
# FASTQ
# used by MAQ (http://maq.sourceforge.net/fastq.shtml)
###############################################################################
# XXX Broken?
# @<seqname>
#0	regex	=^@[A-Za-z0-9_.:-]+\?\n
# <seq>
#>&1	regex	=^[A-Za-z\n.~]++
# +[<seqname>]
#>>&1	regex	=^[A-Za-z0-9_.:-]*\?\n
# <qual>
#>>>&1	regex	=^[!-~\n]+\n		FASTQ

###############################################################################
# FASTA
# used by FASTA (https://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf)
###############################################################################
#0	byte	0x3e
#>0	regex	=^[>][!-~\t\ ]+$
# Amino Acid codes: [A-IK-Z*-]+
#>>1	regex	!=[!-'Jj;:=?@^`|~\\]		FASTA
# IUPAC codes/gaps: [ACGTURYKMSWBDHVNX-]+
# not in IUPAC codes/gaps: [EFIJLOPQZ]
#>>>1	regex	!=[EFIJLOPQZefijlopqz]		\b, with IUPAC nucleotide codes
#>>>1	regex	=^[EFIJLOPQZefijlopqz]+$	\b, with Amino Acid codes

###############################################################################
# SAM (Sequence Alignment/Map format)
# Used by SAMtools (http://samtools.sourceforge.net/SAM1.pdf)
###############################################################################
# Short-cut version to recognise SAM files with (optional) header at beginning
###############################################################################
# The file must start with "@HD<TAB>"; a later "VN:" tag identifies it as SAM,
# and the number that follows is printed as the version.
0	string	@HD\t
>4	search	VN:	Sequence Alignment/Map (SAM), with header
>>&0	regex	[0-9.]+	\b version %s
###############################################################################
# Longer version to recognise SAM alignment lines using (many) regexes
#
# Each nested test checks one tab-separated column of an alignment line;
# the description is only printed once the QUAL column has been accepted.
###############################################################################
# SAM Alignment QNAME (followed by eleven more tab-separated fields)
0		regex	=^[!-?A-~]{1,255}(\t[^\t]+){11}
# SAM Alignment FLAG
>0		regex	=^([^\t]+\t){1}[0-9]{1,5}\t
# SAM Alignment RNAME: either "*" or a name that does not start with "*" or
# "=".  The alternation is grouped so that both branches stay anchored to the
# third column; ungrouped, the second branch matched any tab in the buffer
# and the test could never fail.
>>0		regex	=^([^\t]+\t){2}(\\*|[^*=\t][^\t]*)\t
# SAM Alignment POS
>>>0		regex	=^([^\t]+\t){3}[0-9]{1,9}\t
# SAM Alignment MAPQ
>>>>0		regex	=^([^\t]+\t){4}[0-9]{1,3}\t
# SAM Alignment CIGAR
>>>>>0		regex	=\t(\\*|([0-9]+[MIDNSHPX=])+)\t
# SAM Alignment RNEXT
>>>>>>0		regex	=\t(\\*|=|[!-()+->?-~][!-~]*)\t
# SAM Alignment PNEXT
>>>>>>>0	regex	=^([^\t]+\t){7}[0-9]{1,9}\t
# SAM Alignment TLEN
>>>>>>>>0	regex	=\t[+-]{0,1}[0-9]{1,9}\t.*\t
# SAM Alignment SEQ
>>>>>>>>>0	regex	=^([^\t]+\t){9}(\\*|[A-Za-z=.]+)\t
# SAM Alignment QUAL
>>>>>>>>>>0	regex	=^([^\t]+\t){10}[!-~]+	Sequence Alignment/Map (SAM)
# Optional @HD header line and the version number that follows its VN: tag
>>>>>>>>>>>0	regex	=^[@]HD\t.*VN:		\b, with header
>>>>>>>>>>>>&0	regex	=[0-9.]+		\b version %s
179