encoding.c (191739) | encoding.c (226048) |
---|---|
1/* 2 * Copyright (c) Ian F. Darwin 1986-1995. 3 * Software written by Ian F. Darwin and others; 4 * maintained 1995-present by Christos Zoulas and others. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: --- 21 unchanged lines hidden (view full) --- 30 * 31 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 32 * international characters. 33 */ 34 35#include "file.h" 36 37#ifndef lint | 1/* 2 * Copyright (c) Ian F. Darwin 1986-1995. 3 * Software written by Ian F. Darwin and others; 4 * maintained 1995-present by Christos Zoulas and others. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: --- 21 unchanged lines hidden (view full) --- 30 * 31 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 32 * international characters. 33 */ 34 35#include "file.h" 36 37#ifndef lint |
38FILE_RCSID("@(#)$File: encoding.c,v 1.3 2009/02/03 20:27:51 christos Exp $") | 38FILE_RCSID("@(#)$File: encoding.c,v 1.5 2010/07/21 16:47:17 christos Exp $") |
39#endif /* lint */ 40 41#include "magic.h" 42#include <string.h> 43#include <memory.h> 44#include <stdlib.h> 45 46 47private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 48private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, 49 size_t *); 50private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); 51private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 52private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 53private void from_ebcdic(const unsigned char *, size_t, unsigned char *); 54 | 39#endif /* lint */ 40 41#include "magic.h" 42#include <string.h> 43#include <memory.h> 44#include <stdlib.h> 45 46 47private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 48private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, 49 size_t *); 50private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); 51private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 52private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 53private void from_ebcdic(const unsigned char *, size_t, unsigned char *); 54 |
55#ifdef DEBUG_ENCODING 56#define DPRINTF(a) printf a 57#else 58#define DPRINTF(a) 59#endif 60 |
|
55/* 56 * Try to determine whether text is in some character code we can 57 * identify. Each of these tests, if it succeeds, will leave 58 * the text converted into one-unichar-per-character Unicode in 59 * ubuf, and the number of characters converted in ulen. 60 */ 61protected int 62file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) --- 10 unchanged lines hidden (view full) --- 73 mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 74 if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) { 75 file_oomem(ms, mlen); 76 goto done; 77 } 78 79 *type = "text"; 80 if (looks_ascii(buf, nbytes, *ubuf, ulen)) { | 61/* 62 * Try to determine whether text is in some character code we can 63 * identify. Each of these tests, if it succeeds, will leave 64 * the text converted into one-unichar-per-character Unicode in 65 * ubuf, and the number of characters converted in ulen. 66 */ 67protected int 68file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) --- 10 unchanged lines hidden (view full) --- 79 mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 80 if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) { 81 file_oomem(ms, mlen); 82 goto done; 83 } 84 85 *type = "text"; 86 if (looks_ascii(buf, nbytes, *ubuf, ulen)) { |
87 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); |
|
81 *code = "ASCII"; 82 *code_mime = "us-ascii"; 83 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { | 88 *code = "ASCII"; 89 *code_mime = "us-ascii"; 90 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { |
91 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); |
|
84 *code = "UTF-8 Unicode (with BOM)"; 85 *code_mime = "utf-8"; 86 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { | 92 *code = "UTF-8 Unicode (with BOM)"; 93 *code_mime = "utf-8"; 94 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { |
95 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 96 *code = "UTF-8 Unicode (with BOM)"; |
|
87 *code = "UTF-8 Unicode"; 88 *code_mime = "utf-8"; 89 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 90 if (ucs_type == 1) { 91 *code = "Little-endian UTF-16 Unicode"; 92 *code_mime = "utf-16le"; 93 } else { 94 *code = "Big-endian UTF-16 Unicode"; 95 *code_mime = "utf-16be"; 96 } | 97 *code = "UTF-8 Unicode"; 98 *code_mime = "utf-8"; 99 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 100 if (ucs_type == 1) { 101 *code = "Little-endian UTF-16 Unicode"; 102 *code_mime = "utf-16le"; 103 } else { 104 *code = "Big-endian UTF-16 Unicode"; 105 *code_mime = "utf-16be"; 106 } |
107 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); |
|
97 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { | 108 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { |
109 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); |
|
98 *code = "ISO-8859"; 99 *code_mime = "iso-8859-1"; 100 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { | 110 *code = "ISO-8859"; 111 *code_mime = "iso-8859-1"; 112 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { |
113 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); |
|
101 *code = "Non-ISO extended-ASCII"; 102 *code_mime = "unknown-8bit"; 103 } else { 104 from_ebcdic(buf, nbytes, nbuf); 105 106 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { | 114 *code = "Non-ISO extended-ASCII"; 115 *code_mime = "unknown-8bit"; 116 } else { 117 from_ebcdic(buf, nbytes, nbuf); 118 119 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { |
120 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); |
|
107 *code = "EBCDIC"; 108 *code_mime = "ebcdic"; 109 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { | 121 *code = "EBCDIC"; 122 *code_mime = "ebcdic"; 123 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { |
124 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 125 *ulen)); |
|
110 *code = "International EBCDIC"; 111 *code_mime = "ebcdic"; 112 } else { /* Doesn't look like text at all */ | 126 *code = "International EBCDIC"; 127 *code_mime = "ebcdic"; 128 } else { /* Doesn't look like text at all */ |
129 DPRINTF(("binary\n")); |
|
113 rv = 0; 114 *type = "binary"; 115 } 116 } 117 118 done: 119 if (nbuf) 120 free(nbuf); --- 364 unchanged lines hidden --- | 130 rv = 0; 131 *type = "binary"; 132 } 133 } 134 135 done: 136 if (nbuf) 137 free(nbuf); --- 364 unchanged lines hidden --- |