1/*
2 * Custom subclass of PyUnicode_Type, to allow for transparent bridging of
3 * strings
4 */
5
6#include "pyobjc.h"
7
8#include <stddef.h>
9#include <Foundation/NSString.h>
10
/*
 * Instance layout of objc.pyobjc_unicode: a regular unicode object followed
 * by the bookkeeping needed to find the NSString it was created from.
 */
typedef struct {
	/* Unicode header; kept first so the object can be used as a unicode object */
	PyUnicodeObject	base;
	/* Weak reference list head (see tp_weaklistoffset of PyObjCUnicode_Type) */
	PyObject*	weakrefs;
	/* The wrapped NSString; retained by PyObjCUnicode_New, released in class_dealloc */
	id		nsstr;
	/* Proxy object for 'nsstr', created on first use by meth_nsstring (may be NULL) */
	PyObject* py_nsstr;
} PyObjCUnicodeObject;
17
/* Docstring for the type, installed as tp_doc of PyObjCUnicode_Type. */
PyDoc_STRVAR(class_doc,
	"objc.pyobjc_unicode\n\n"
	"Subclass of unicode for representing NSString values. "
	"Use \nthe method nsstring to access the NSString. \n"
	"Note that instances are immutable and won't be updated "
	"when\nthe value of the NSString changes."
);
26
/*
 * tp_dealloc for objc.pyobjc_unicode.
 *
 * Removes the object from the proxy registry, drops the cached proxy and the
 * reference to the NSString, clears weak references and then hands the
 * object to the deallocator of the unicode base type.
 */
static void
class_dealloc(PyObject* obj)
{
	PyObjCUnicodeObject* self = (PyObjCUnicodeObject*)obj;

	/* Read all members up front, before calling out of this function */
	id string = self->nsstr;
	PyObject* weaklist = self->weakrefs;
	PyObject* cached_proxy = self->py_nsstr;

	PyObjC_UnregisterPythonProxy(string, obj);

	if (cached_proxy != NULL) {
		Py_DECREF(cached_proxy);
	}

	if (string != nil) {
		CFRelease(string);
	}

	if (weaklist != NULL) {
		PyObject_ClearWeakRefs(obj);
	}

	/* The base type releases the character data and the object itself */
	PyUnicode_Type.tp_dealloc(obj);
}
48
/*
 * Return a new reference to the proxy object for the wrapped NSString.
 *
 * The proxy is created on first use and cached in 'py_nsstr'. Returns NULL
 * when the proxy cannot be created (presumably PyObjCObject_New has set a
 * Python exception in that case -- confirm against its implementation).
 */
static PyObject*
meth_nsstring(PyObject* self)
{
	PyObjCUnicodeObject* uobj = (PyObjCUnicodeObject*)self;

	if (uobj->py_nsstr == NULL) {
		uobj->py_nsstr = PyObjCObject_New(uobj->nsstr,
				PyObjCObject_kDEFAULT, YES);
		if (uobj->py_nsstr == NULL) {
			/* Creating the proxy failed; Py_INCREF(NULL) would crash */
			return NULL;
		}
	}

	Py_INCREF(uobj->py_nsstr);
	return uobj->py_nsstr;
}
60
61
/*
 * tp_getattro: look up an attribute on the unicode object and fall back to
 * the proxy of the wrapped NSString when the attribute does not exist.
 *
 * Only AttributeError triggers the fallback; any other exception raised by
 * the regular lookup is propagated unchanged. Returns a new reference, or
 * NULL with an exception set.
 */
static PyObject*
meth_getattro(PyObject *o, PyObject *attr_name)
{
	PyObject* res;
	PyObject* py_nsstr;

	res = PyObject_GenericGetAttr(o, attr_name);
	if (res != NULL) {
		return res;
	}

	if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
		/* Not a "missing attribute" error, don't hide it */
		return NULL;
	}
	PyErr_Clear();

	py_nsstr = meth_nsstring(o);
	if (py_nsstr == NULL) {
		/* No proxy available, PyObject_GetAttr must not be called with NULL */
		return NULL;
	}

	res = PyObject_GetAttr(py_nsstr, attr_name);
	Py_DECREF(py_nsstr);
	return res;
}
75
/*
 * __reduce__ implementation: returns the tuple
 *
 *     (PyUnicode_Type, (unicode_value,))
 *
 * where unicode_value is the result of PyUnicode_FromObject(self), so that
 * unpickling recreates the value through the unicode type rather than
 * through this type.
 */
static PyObject*
meth_reduce(PyObject* self)
{
	PyObject* pair;
	PyObject* plain;
	PyObject* ctor_args;

	pair = PyTuple_New(2);
	if (pair == NULL) {
		return NULL;
	}

	/* Item 0: the callable used to recreate the value */
	Py_INCREF((PyObject*)&PyUnicode_Type);
	PyTuple_SET_ITEM(pair, 0, (PyObject*)&PyUnicode_Type);

	plain = PyUnicode_FromObject(self);
	if (plain == NULL) {
		Py_DECREF(pair);
		return NULL;
	}

	/* Item 1: the argument tuple for that callable */
	ctor_args = PyTuple_New(1);
	if (ctor_args == NULL) {
		Py_DECREF(pair);
		Py_DECREF(plain);
		return NULL;
	}

	/* Both SET_ITEM calls steal the reference they are given */
	PyTuple_SET_ITEM(ctor_args, 0, plain);
	PyTuple_SET_ITEM(pair, 1, ctor_args);

	return pair;
}
105
/* Method table for objc.pyobjc_unicode (tp_methods). */
static PyMethodDef class_methods[] = {
	{
		.ml_name  = "nsstring",
		.ml_meth  = (PyCFunction)meth_nsstring,
		.ml_flags = METH_NOARGS,
		.ml_doc   = "directly access NSString instance"
	},
	{
		.ml_name  = "__reduce__",
		.ml_meth  = (PyCFunction)meth_reduce,
		.ml_flags = METH_NOARGS,
		.ml_doc   = "Used for pickling"
	},
	{ .ml_name = NULL } /* sentinel, all other fields are zero as well */
};
121
/*
 * Getter for the '__pyobjc_object__' attribute: same value as the
 * 'nsstring' method, the closure argument is not used.
 */
static PyObject*
nsstring_get__pyobjc_object__(PyObject *self, void *closure __attribute__((__unused__)))
{
	PyObject* proxy = meth_nsstring(self);

	return proxy;
}
126
/* Attribute table for objc.pyobjc_unicode (tp_getset); read-only entries only. */
static PyGetSetDef nsstring_getsetters[] = {
	{
		.name    = "__pyobjc_object__",
		.get     = (getter)nsstring_get__pyobjc_object__,
		.set     = NULL,
		.doc     = "raw NSString instance",
		.closure = NULL
	},
	{ .name = NULL } /* sentinel, all other fields are zero as well */
};
141
/*
 * tp_new: instances can only be created from C through PyObjCUnicode_New,
 * calling the type from Python always raises TypeError.
 *
 * The message names the type as it is visible to Python
 * (tp_name is "objc.pyobjc_unicode").
 */
static PyObject*
class_new(
	PyTypeObject* type __attribute__((__unused__)),
	PyObject* args __attribute__((__unused__)),
	PyObject* kwds __attribute__((__unused__)))
{
	PyErr_SetString(PyExc_TypeError,
			"Cannot create instances of 'objc.pyobjc_unicode' in Python");
	return NULL;
}
152
/*
 * Type object for objc.pyobjc_unicode, a subclass of the builtin unicode
 * type. Slots that are not mentioned are zero-initialized.
 */
PyTypeObject PyObjCUnicode_Type = {
	PyVarObject_HEAD_INIT(&PyType_Type, 0)
	.tp_name           = "objc.pyobjc_unicode",
	.tp_basicsize      = sizeof(PyObjCUnicodeObject),
	.tp_itemsize       = 0,
	.tp_dealloc        = class_dealloc,
	.tp_getattro       = meth_getattro,
	.tp_flags          = Py_TPFLAGS_DEFAULT,
	.tp_doc            = class_doc,
	.tp_weaklistoffset = offsetof(PyObjCUnicodeObject, weakrefs),
	.tp_methods        = class_methods,
	.tp_getset         = nsstring_getsetters,
	.tp_base           = &PyUnicode_Type,
	.tp_new            = class_new,
};
206
207
208#if PY_VERSION_HEX >= 0x03030000
209   /*
210    * Python 3.3 introduced a new, more efficient representation
211    * for unicode objects.
212    *
213    * This function cannot use the most efficient
214    * representation where the character data is stored in the same
215    * memory block as the object header because PyObjCUnicode adds
216    * more data to the object header, which PyUnicode does not
217    * expect.
218    *
219    * This function therefore creates a "legacy string, ready" (see
220    * unicodeobject.h in the python 3.3 source tree for more information)
221    *
222    *
223    * XXX: I'm not very happy about this implementation, it is too verbose
224    *      and seems to be even more fragile than the implementation for
225    *      older python versions.
226    */
/*
 * Create an objc.pyobjc_unicode object with the contents of 'value'.
 *
 * The UTF-16 code units of the string are copied into a non-compact
 * ("legacy, ready") unicode object; valid surrogate pairs are combined into
 * a single code point, unpaired surrogates are copied as-is.
 *
 * 'value' is retained by the new object and released again when the object
 * is deallocated. NOTE(review): 'value' is presumably never nil, CFRetain is
 * called on it unconditionally -- confirm against the callers.
 *
 * Returns a new reference, or NULL with a Python exception set.
 *
 * All buffers are allocated with PyObject_MALLOC and must be released with
 * PyObject_FREE: the 2-byte representation hands the 'characters' buffer to
 * the unicode object, which will free it using the object allocator.
 */
PyObject*
PyObjCUnicode_New(NSString* value)
{
	PyObjCUnicodeObject* result;
	PyASCIIObject *ascii;
	PyCompactUnicodeObject *compact;

	NSInteger i, length;
	unichar* volatile characters = NULL;
	NSRange range;

	PyObjC_DURING
		length = [value length];
		characters = PyObject_MALLOC(sizeof(unichar) * (length+1));
		if (characters == NULL) {
			PyErr_NoMemory();
			NS_VALUERETURN(NULL, PyObject*);
		}

		range = NSMakeRange(0, length);

		[value getCharacters: characters range: range];
		characters[length] = 0;

	PyObjC_HANDLER
		if (characters) {
			PyObject_FREE(characters);
			characters = NULL;
		}
		PyObjCErr_FromObjC(localException);
		NS_VALUERETURN(NULL, PyObject*);
	PyObjC_ENDHANDLER

	/* Pass 1: find the largest code point and count the surrogate pairs */
	Py_UCS4 maxchar = 0;
	NSInteger nr_surrogates = 0;
	for (i = 0; i < length; i++) {
		Py_UCS4 cur = (Py_UCS4)characters[i];
		if (Py_UNICODE_IS_HIGH_SURROGATE(cur)
				&& (i < length - 1)
				&& Py_UNICODE_IS_LOW_SURROGATE(characters[i+1])) {
			cur = Py_UNICODE_JOIN_SURROGATES(
				characters[i],
				characters[i+1]);
			i++;
			nr_surrogates++;
		}
		if (cur > maxchar) {
			maxchar = cur;
		}
	}

	/* Number of code points in the resulting python string */
	Py_ssize_t nchars = (Py_ssize_t)(length - nr_surrogates);

	/* Select the narrowest representation. ASCII is the range 0..127,
	 * U+0080 must not be flagged as ASCII because the character data of
	 * an ASCII string doubles as its UTF-8 representation.
	 */
	int kind;
	int is_ascii = 0;
	if (maxchar < 128) {
		is_ascii = 1;
		kind = PyUnicode_1BYTE_KIND;
	} else if (maxchar <= 255) {
		kind = PyUnicode_1BYTE_KIND;
	} else if (maxchar <= 0xFFFF) {
		kind = PyUnicode_2BYTE_KIND;
	} else {
		kind = PyUnicode_4BYTE_KIND;
	}

	/* Pass 2: create the storage for the code points and copy the data.
	 *
	 * A joined surrogate pair is always >= 0x10000, which means that
	 * surrogate pairs can only be present for the 4 byte representation
	 * (for the other two 'nr_surrogates' is 0 and 'nchars' == 'length').
	 *
	 * The python object itself is created after this, that way there is
	 * never a need to deallocate a partially initialized object.
	 */
	void* data = NULL;

	if (kind == PyUnicode_1BYTE_KIND) {
		Py_UCS1* latin1 = PyObject_MALLOC(sizeof(Py_UCS1) * (nchars + 1));
		if (latin1 == NULL) {
			PyObject_FREE(characters); characters = NULL;
			PyErr_NoMemory();
			return NULL;
		}
		for (i = 0; i < length; i++) {
			latin1[i] = (Py_UCS1)characters[i];
		}
		latin1[length] = 0;
		data = latin1;

	} else if (kind == PyUnicode_2BYTE_KIND) {
		/* No surrogate pairs and 2BYTE_KIND, this means the (NUL
		 * terminated) unichar buffer can be reused as storage for
		 * the python unicode string.
		 */
		data = (void*)characters;
		characters = NULL;

	} else { /* 4BYTE_KIND */
		Py_UCS4* ucs4 = PyObject_MALLOC(sizeof(Py_UCS4) * (nchars + 1));
		if (ucs4 == NULL) {
			PyObject_FREE(characters); characters = NULL;
			PyErr_NoMemory();
			return NULL;
		}

		Py_UCS4* ucs4_cur = ucs4;
		for (i = 0; i < length; i++) {
			if (Py_UNICODE_IS_HIGH_SURROGATE(characters[i])
					&& (i < length - 1)
					&& Py_UNICODE_IS_LOW_SURROGATE(characters[i+1])) {
				/* The result is at most 0x10FFFF, and therefore
				 * always a valid code point for python.
				 */
				*ucs4_cur++ = Py_UNICODE_JOIN_SURROGATES(
					characters[i],
					characters[i+1]);
				i++;
			} else {
				*ucs4_cur++ = (Py_UCS4)characters[i];
			}
		}
		*ucs4_cur = 0;
		data = ucs4;
	}

	if (characters != NULL) {
		PyObject_FREE(characters);
		characters = NULL;
	}

	result = PyObject_New(PyObjCUnicodeObject, &PyObjCUnicode_Type);
	if (result == NULL) {
		/* PyObject_New has set the exception */
		PyObject_FREE(data);
		return NULL;
	}
	ascii = (PyASCIIObject*)result;
	compact = (PyCompactUnicodeObject*)result;

	/* Fill the unicode header: a non-compact, ready string */
	ascii->hash = -1;
	ascii->wstr = NULL;
	ascii->length = nchars;

	ascii->state.interned = SSTATE_NOT_INTERNED;
	ascii->state.kind = kind;
	ascii->state.compact = 0;
	ascii->state.ascii = is_ascii;
	ascii->state.ready = 1;

	compact->utf8_length = 0;
	compact->utf8 = NULL;
	compact->wstr_length = 0;

	result->base.data.any = data;

	if (is_ascii) {
		/* With ASCII representation the UTF8 representation is
		 * also known without further calculation, and MUST be
		 * filled according to the spec
		 */
		compact->utf8_length = nchars;
		compact->utf8 = (char*)data;
	}

	/* Share the character data with the wchar_t representation when
	 * both have the same element size.
	 */
#if SIZEOF_WCHAR_T == 2
	if (kind == PyUnicode_2BYTE_KIND) {
		ascii->wstr = (wchar_t*)data;
		compact->wstr_length = ascii->length;
	}
#endif
#if SIZEOF_WCHAR_T == 4
	if (kind == PyUnicode_4BYTE_KIND) {
		ascii->wstr = (wchar_t*)data;
		compact->wstr_length = ascii->length;
	}
#endif

	/* Finally store PyObjCUnicode specific data */
	result->weakrefs = NULL;
	result->py_nsstr = NULL;
	result->nsstr = value;
	CFRetain(value);

#ifdef Py_DEBUG
	/* Check that the unicode object is correct */
	_PyUnicode_CheckConsistency((PyObject*)result, 1);
#endif

	return (PyObject*)result;
}
445
446#else /* Python 3.2 and before */
/*
 * Create an objc.pyobjc_unicode object with the contents of 'value'
 * (implementation for Python 3.2 and earlier).
 *
 * 'value' is retained by the new object and released again when the object
 * is deallocated. Returns a new reference, or NULL with a Python exception
 * set.
 */
PyObject*
PyObjCUnicode_New(NSString* value)
{
	/* Conversion to PyUnicode without creating an autoreleased object.
	 *
	 * NOTE: A final optimization is removing the copy of 'characters', but
	 * that can only be done when sizeof(unichar) == Py_UNICODE_SIZE.
	 *
	 * The reason for doing this: NSThread
	 *     +detachNewThreadSelector:toTarget:withObject:, with a string
	 *     as one of the arguments.
	 *
	 * Another reason is that the following loop 'leaks' memory when using
	 * -UTF8String:
	 *  	while True:
	 *  		NSString.alloc().init()
	 *
	 *  while the following doesn't:
	 *
	 *  	while True:
	 *  		NSArray.alloc().init()
	 */
	PyObjCUnicodeObject* result;
	Py_UNICODE* buffer;

	/* The character buffer is created and filled before the python object
	 * is allocated, that way there is never a need to deallocate a
	 * partially initialized object.
	 */
#ifdef PyObjC_UNICODE_FAST_PATH
	Py_ssize_t length = [value length];
	NSRange range;

	if (length < 0) {
		PyErr_SetString(PyExc_SystemError, "string with negative length");
		return NULL;
	}

	buffer = PyObject_MALLOC(sizeof(Py_UNICODE) * (length+1));
	if (buffer == NULL) {
		PyErr_NoMemory();
		return NULL;
	}

	/* unichar and Py_UNICODE have the same size, copy directly */
	range = NSMakeRange(0, length);
	[value getCharacters:(unichar *)buffer range:range];
	buffer[length] = 0;
#else
	Py_ssize_t i, length;
	unichar* volatile characters = NULL;
	NSRange range;

	PyObjC_DURING
		length = [value length];
		characters = PyMem_Malloc(sizeof(unichar) * length);
		if (characters == NULL) {
			PyErr_NoMemory();
			NS_VALUERETURN(NULL, PyObject*);
		}

		range = NSMakeRange(0, length);

		[value getCharacters: characters range: range];

	PyObjC_HANDLER
		if (characters) {
			PyMem_Free(characters);
			characters = NULL;
		}
		PyObjCErr_FromObjC(localException);
		NS_VALUERETURN(NULL, PyObject*);
	PyObjC_ENDHANDLER

	buffer = PyObject_MALLOC(sizeof(Py_UNICODE) * (length+1));
	if (buffer == NULL) {
		PyMem_Free(characters); characters = NULL;
		PyErr_NoMemory();
		return NULL;
	}

	/* Widen every UTF-16 code unit to a Py_UNICODE */
	for (i = 0; i < length; i++) {
		buffer[i] = (Py_UNICODE)(characters[i]);
	}
	buffer[length] = 0;
	PyMem_Free(characters); characters = NULL;
#endif

	result = PyObject_New(PyObjCUnicodeObject, &PyObjCUnicode_Type);
	if (result == NULL) {
		/* PyObject_New has set the exception */
		PyObject_FREE(buffer);
		return NULL;
	}

	result->base.str = buffer;
	result->base.length = length;
	result->base.hash = -1;
#if PY_MAJOR_VERSION == 3
	result->base.state = 0;
#endif
	result->base.defenc = NULL;

	if (length == 0) {
		/* The hash of the empty string is known in advance */
		result->base.hash = 0;
	}

	result->weakrefs = NULL;
	result->py_nsstr = NULL;
	result->nsstr = value;
	CFRetain(value);

	return (PyObject*)result;
}
557#endif /* Python 3.2 and before */
558
/*
 * Return the NSString stored in an objc.pyobjc_unicode object.
 *
 * The result is the pointer stored in the object, this function does not
 * retain it. Returns nil and sets a Python exception (through
 * PyErr_BadInternalCall) when 'value' does not pass PyObjCUnicode_Check.
 */
NSString*
PyObjCUnicode_Extract(PyObject* value)
{
	if (PyObjCUnicode_Check(value)) {
		PyObjCUnicodeObject* uobj = (PyObjCUnicodeObject*)value;

		return uobj->nsstr;
	}

	PyErr_BadInternalCall();
	return nil;
}
569