Deleted Added
full compact
encoding.c (191739) encoding.c (226048)
1/*
2 * Copyright (c) Ian F. Darwin 1986-1995.
3 * Software written by Ian F. Darwin and others;
4 * maintained 1995-present by Christos Zoulas and others.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:

--- 21 unchanged lines hidden (view full) ---

30 *
31 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
32 * international characters.
33 */
34
35#include "file.h"
36
37#ifndef lint
1/*
2 * Copyright (c) Ian F. Darwin 1986-1995.
3 * Software written by Ian F. Darwin and others;
4 * maintained 1995-present by Christos Zoulas and others.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:

--- 21 unchanged lines hidden (view full) ---

30 *
31 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
32 * international characters.
33 */
34
35#include "file.h"
36
37#ifndef lint
38FILE_RCSID("@(#)$File: encoding.c,v 1.3 2009/02/03 20:27:51 christos Exp $")
38FILE_RCSID("@(#)$File: encoding.c,v 1.5 2010/07/21 16:47:17 christos Exp $")
39#endif /* lint */
40
41#include "magic.h"
42#include <string.h>
43#include <memory.h>
44#include <stdlib.h>
45
46
47private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
48private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
49 size_t *);
50private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
51private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
52private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
53private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
54
39#endif /* lint */
40
41#include "magic.h"
42#include <string.h>
43#include <memory.h>
44#include <stdlib.h>
45
46
47private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
48private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
49 size_t *);
50private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
51private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
52private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
53private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
54
55#ifdef DEBUG_ENCODING
56#define DPRINTF(a) printf a
57#else
58#define DPRINTF(a)
59#endif
60
55/*
56 * Try to determine whether text is in some character code we can
57 * identify. Each of these tests, if it succeeds, will leave
58 * the text converted into one-unichar-per-character Unicode in
59 * ubuf, and the number of characters converted in ulen.
60 */
61protected int
62file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)

--- 10 unchanged lines hidden (view full) ---

73 mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
74 if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
75 file_oomem(ms, mlen);
76 goto done;
77 }
78
79 *type = "text";
80 if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
61/*
62 * Try to determine whether text is in some character code we can
63 * identify. Each of these tests, if it succeeds, will leave
64 * the text converted into one-unichar-per-character Unicode in
65 * ubuf, and the number of characters converted in ulen.
66 */
67protected int
68file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)

--- 10 unchanged lines hidden (view full) ---

79 mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
80 if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
81 file_oomem(ms, mlen);
82 goto done;
83 }
84
85 *type = "text";
86 if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
87 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
81 *code = "ASCII";
82 *code_mime = "us-ascii";
83 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
88 *code = "ASCII";
89 *code_mime = "us-ascii";
90 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
91 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
84 *code = "UTF-8 Unicode (with BOM)";
85 *code_mime = "utf-8";
86 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
92 *code = "UTF-8 Unicode (with BOM)";
93 *code_mime = "utf-8";
94 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
95 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
96 *code = "UTF-8 Unicode (with BOM)";
87 *code = "UTF-8 Unicode";
88 *code_mime = "utf-8";
89 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
90 if (ucs_type == 1) {
91 *code = "Little-endian UTF-16 Unicode";
92 *code_mime = "utf-16le";
93 } else {
94 *code = "Big-endian UTF-16 Unicode";
95 *code_mime = "utf-16be";
96 }
97 *code = "UTF-8 Unicode";
98 *code_mime = "utf-8";
99 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
100 if (ucs_type == 1) {
101 *code = "Little-endian UTF-16 Unicode";
102 *code_mime = "utf-16le";
103 } else {
104 *code = "Big-endian UTF-16 Unicode";
105 *code_mime = "utf-16be";
106 }
107 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
97 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
108 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
109 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
98 *code = "ISO-8859";
99 *code_mime = "iso-8859-1";
100 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
110 *code = "ISO-8859";
111 *code_mime = "iso-8859-1";
112 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
113 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
101 *code = "Non-ISO extended-ASCII";
102 *code_mime = "unknown-8bit";
103 } else {
104 from_ebcdic(buf, nbytes, nbuf);
105
106 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
114 *code = "Non-ISO extended-ASCII";
115 *code_mime = "unknown-8bit";
116 } else {
117 from_ebcdic(buf, nbytes, nbuf);
118
119 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
120 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
107 *code = "EBCDIC";
108 *code_mime = "ebcdic";
109 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
121 *code = "EBCDIC";
122 *code_mime = "ebcdic";
123 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
124 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
125 *ulen));
110 *code = "International EBCDIC";
111 *code_mime = "ebcdic";
112 } else { /* Doesn't look like text at all */
126 *code = "International EBCDIC";
127 *code_mime = "ebcdic";
128 } else { /* Doesn't look like text at all */
129 DPRINTF(("binary\n"));
113 rv = 0;
114 *type = "binary";
115 }
116 }
117
118 done:
119 if (nbuf)
120 free(nbuf);

--- 364 unchanged lines hidden ---
130 rv = 0;
131 *type = "binary";
132 }
133 }
134
135 done:
136 if (nbuf)
137 free(nbuf);

--- 364 unchanged lines hidden ---