/*
  tre-python.c - TRE Python language bindings

  This software is released under a BSD-style license.
  See the file LICENSE for details and copyright.

  The original version of this code was contributed by
  Nikolai Saoukh <nms+python@otdel1.org>.

*/
11
12
13 #include "Python.h"
14 #include "structmember.h"
15
16 #include <tre/tre.h>
17
/* Module name; also used as the prefix of the type names and doc strings
   defined in this file. */
#define TRE_MODULE "tre"
19
20 typedef struct {
21 PyObject_HEAD
22 regex_t rgx;
23 int flags;
24 } TrePatternObject;
25
26 typedef struct {
27 PyObject_HEAD
28 regaparams_t ap;
29 } TreFuzzynessObject;
30
31 typedef struct {
32 PyObject_HEAD
33 regamatch_t am;
34 PyObject *targ; /* string we matched against */
35 TreFuzzynessObject *fz; /* fuzzyness used during match */
36 } TreMatchObject;
37
38
39 static PyObject *ErrorObject;
40
41 static void
_set_tre_err(int rc,regex_t * rgx)42 _set_tre_err(int rc, regex_t *rgx)
43 {
44 PyObject *errval;
45 char emsg[256];
46 size_t elen;
47
48 elen = tre_regerror(rc, rgx, emsg, sizeof(emsg));
49 if (emsg[elen] == '\0')
50 elen--;
51 errval = Py_BuildValue("s#", emsg, elen);
52 PyErr_SetObject(ErrorObject, errval);
53 Py_XDECREF(errval);
54 }
55
56 static PyObject *
TreFuzzyness_new(PyTypeObject * type,PyObject * args,PyObject * kwds)57 TreFuzzyness_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
58 {
59 static char *kwlist[] = {
60 "delcost", "inscost", "maxcost", "subcost",
61 "maxdel", "maxerr", "maxins", "maxsub",
62 NULL
63 };
64
65 TreFuzzynessObject *self;
66
67 self = (TreFuzzynessObject*)type->tp_alloc(type, 0);
68 if (self == NULL)
69 return NULL;
70 tre_regaparams_default(&self->ap);
71 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iiiiiiii", kwlist,
72 &self->ap.cost_del, &self->ap.cost_ins,
73 &self->ap.max_cost, &self->ap.cost_subst,
74 &self->ap.max_del, &self->ap.max_err,
75 &self->ap.max_ins, &self->ap.max_subst))
76 {
77 Py_DECREF(self);
78 return NULL;
79 }
80 return (PyObject*)self;
81 }
82
83 static PyObject *
TreFuzzyness_repr(PyObject * obj)84 TreFuzzyness_repr(PyObject *obj)
85 {
86 TreFuzzynessObject *self = (TreFuzzynessObject*)obj;
87 PyObject *o;
88
89 o = PyString_FromFormat("%s(delcost=%d,inscost=%d,maxcost=%d,subcost=%d,"
90 "maxdel=%d,maxerr=%d,maxins=%d,maxsub=%d)",
91 self->ob_type->tp_name, self->ap.cost_del,
92 self->ap.cost_ins, self->ap.max_cost,
93 self->ap.cost_subst, self->ap.max_del,
94 self->ap.max_err, self->ap.max_ins,
95 self->ap.max_subst);
96 return o;
97 }
98
99 static PyMemberDef TreFuzzyness_members[] = {
100 { "delcost", T_INT, offsetof(TreFuzzynessObject, ap.cost_del), 0,
101 "The cost of a deleted character" },
102 { "inscost", T_INT, offsetof(TreFuzzynessObject, ap.cost_ins), 0,
103 "The cost of an inserted character" },
104 { "maxcost", T_INT, offsetof(TreFuzzynessObject, ap.max_cost), 0,
105 "The maximum allowed cost of a match. If this is set to zero, an exact "
106 "match is searched for" },
107 { "subcost", T_INT, offsetof(TreFuzzynessObject, ap.cost_subst), 0,
108 "The cost of a substituted character" },
109 { "maxdel", T_INT, offsetof(TreFuzzynessObject, ap.max_del), 0,
110 "Maximum allowed number of deleted characters" },
111 { "maxerr", T_INT, offsetof(TreFuzzynessObject, ap.max_err), 0,
112 "Maximum allowed number of errors (inserts + deletes + substitutes)" },
113 { "maxins", T_INT, offsetof(TreFuzzynessObject, ap.max_ins), 0,
114 "Maximum allowed number of inserted characters" },
115 { "maxsub", T_INT, offsetof(TreFuzzynessObject, ap.max_subst), 0,
116 "Maximum allowed number of substituted characters" },
117 { NULL }
118 };
119
120 static PyTypeObject TreFuzzynessType = {
121 PyObject_HEAD_INIT(NULL)
122 0, /* ob_size */
123 TRE_MODULE ".Fuzzyness", /* tp_name */
124 sizeof(TreFuzzynessObject), /* tp_basicsize */
125 0, /* tp_itemsize */
126 /* methods */
127 0, /* tp_dealloc */
128 0, /* tp_print */
129 0, /* tp_getattr */
130 0, /* tp_setattr */
131 0, /* tp_compare */
132 TreFuzzyness_repr, /* tp_repr */
133 0, /* tp_as_number */
134 0, /* tp_as_sequence */
135 0, /* tp_as_mapping */
136 0, /* tp_hash */
137 0, /* tp_call */
138 0, /* tp_str */
139 0, /* tp_getattro */
140 0, /* tp_setattro */
141 0, /* tp_as_buffer */
142 Py_TPFLAGS_DEFAULT, /* tp_flags */
143 /* tp_doc */
144 TRE_MODULE ".fuzzyness object holds approximation parameters for match",
145 0, /* tp_traverse */
146 0, /* tp_clear */
147 0, /* tp_richcompare */
148 0, /* tp_weaklistoffset */
149 0, /* tp_iter */
150 0, /* tp_iternext */
151 0, /* tp_methods */
152 TreFuzzyness_members, /* tp_members */
153 0, /* tp_getset */
154 0, /* tp_base */
155 0, /* tp_dict */
156 0, /* tp_descr_get */
157 0, /* tp_descr_set */
158 0, /* tp_dictoffset */
159 0, /* tp_init */
160 0, /* tp_alloc */
161 TreFuzzyness_new /* tp_new */
162 };
163
164 static PyObject *
PyTreMatch_groups(TreMatchObject * self,PyObject * dummy)165 PyTreMatch_groups(TreMatchObject *self, PyObject *dummy)
166 {
167 PyObject *result;
168 size_t i;
169
170 if (self->am.nmatch < 1)
171 {
172 Py_INCREF(Py_None);
173 return Py_None;
174 }
175 result = PyTuple_New(self->am.nmatch);
176 for (i = 0; i < self->am.nmatch; i++)
177 {
178 PyObject *range;
179 regmatch_t *rm = &self->am.pmatch[i];
180
181 if (rm->rm_so == (-1) && rm->rm_eo == (-1))
182 {
183 Py_INCREF(Py_None);
184 range = Py_None;
185 }
186 else
187 {
188 range = Py_BuildValue("(ii)", rm->rm_so, rm->rm_eo);
189 }
190 PyTuple_SetItem(result, i, range);
191 }
192 return (PyObject*)result;
193 }
194
195 static PyObject *
PyTreMatch_groupi(PyObject * obj,int gn)196 PyTreMatch_groupi(PyObject *obj, int gn)
197 {
198 TreMatchObject *self = (TreMatchObject*)obj;
199 PyObject *result;
200 regmatch_t *rm;
201
202 if (gn < 0 || (size_t)gn > self->am.nmatch - 1)
203 {
204 PyErr_SetString(PyExc_ValueError, "out of bounds");
205 return NULL;
206 }
207 rm = &self->am.pmatch[gn];
208 if (rm->rm_so == (-1) && rm->rm_eo == (-1))
209 {
210 Py_INCREF(Py_None);
211 return Py_None;
212 }
213 result = PySequence_GetSlice(self->targ, rm->rm_so, rm->rm_eo);
214 return result;
215 }
216
217 static PyObject *
PyTreMatch_group(TreMatchObject * self,PyObject * grpno)218 PyTreMatch_group(TreMatchObject *self, PyObject *grpno)
219 {
220 PyObject *result;
221 long gn;
222
223 gn = PyInt_AsLong(grpno);
224
225 if (PyErr_Occurred())
226 return NULL;
227
228 result = PyTreMatch_groupi((PyObject*)self, gn);
229 return result;
230 }
231
232 static PyMethodDef TreMatch_methods[] = {
233 {"group", (PyCFunction)PyTreMatch_group, METH_O,
234 "return submatched string or None if a parenthesized subexpression did "
235 "not participate in a match"},
236 {"groups", (PyCFunction)PyTreMatch_groups, METH_NOARGS,
237 "return the tuple of slice tuples for all parenthesized subexpressions "
238 "(None for not participated)"},
239 {NULL, NULL}
240 };
241
242 static PyMemberDef TreMatch_members[] = {
243 { "cost", T_INT, offsetof(TreMatchObject, am.cost), READONLY,
244 "Cost of the match" },
245 { "numdel", T_INT, offsetof(TreMatchObject, am.num_del), READONLY,
246 "Number of deletes in the match" },
247 { "numins", T_INT, offsetof(TreMatchObject, am.num_ins), READONLY,
248 "Number of inserts in the match" },
249 { "numsub", T_INT, offsetof(TreMatchObject, am.num_subst), READONLY,
250 "Number of substitutes in the match" },
251 { "fuzzyness", T_OBJECT, offsetof(TreMatchObject, fz), READONLY,
252 "Fuzzyness used during match" },
253 { NULL }
254 };
255
256 static void
PyTreMatch_dealloc(TreMatchObject * self)257 PyTreMatch_dealloc(TreMatchObject *self)
258 {
259 Py_XDECREF(self->targ);
260 Py_XDECREF(self->fz);
261 if (self->am.pmatch != NULL)
262 PyMem_Del(self->am.pmatch);
263 PyObject_Del(self);
264 }
265
266 static PySequenceMethods TreMatch_as_sequence_methods = {
267 0, /* sq_length */
268 0, /* sq_concat */
269 0, /* sq_repeat */
270 PyTreMatch_groupi, /* sq_item */
271 0, /* sq_slice */
272 0, /* sq_ass_item */
273 0, /* sq_ass_slice */
274 0, /* sq_contains */
275 0, /* sq_inplace_concat */
276 0 /* sq_inplace_repeat */
277 };
278
279 static PyTypeObject TreMatchType = {
280 PyObject_HEAD_INIT(NULL)
281 0, /* ob_size */
282 TRE_MODULE ".Match", /* tp_name */
283 sizeof(TreMatchObject), /* tp_basicsize */
284 0, /* tp_itemsize */
285 /* methods */
286 (destructor)PyTreMatch_dealloc, /* tp_dealloc */
287 0, /* tp_print */
288 0, /* tp_getattr */
289 0, /* tp_setattr */
290 0, /* tp_compare */
291 0, /* tp_repr */
292 0, /* tp_as_number */
293 &TreMatch_as_sequence_methods, /* tp_as_sequence */
294 0, /* tp_as_mapping */
295 0, /* tp_hash */
296 0, /* tp_call */
297 0, /* tp_str */
298 0, /* tp_getattro */
299 0, /* tp_setattro */
300 0, /* tp_as_buffer */
301 Py_TPFLAGS_DEFAULT, /* tp_flags */
302 TRE_MODULE ".match object holds result of successful match", /* tp_doc */
303 0, /* tp_traverse */
304 0, /* tp_clear */
305 0, /* tp_richcompare */
306 0, /* tp_weaklistoffset */
307 0, /* tp_iter */
308 0, /* tp_iternext */
309 TreMatch_methods, /* tp_methods */
310 TreMatch_members /* tp_members */
311 };
312
313 static TreMatchObject *
newTreMatchObject(void)314 newTreMatchObject(void)
315 {
316 TreMatchObject *self;
317
318 self = PyObject_New(TreMatchObject, &TreMatchType);
319 if (self == NULL)
320 return NULL;
321 memset(&self->am, '\0', sizeof(self->am));
322 self->targ = NULL;
323 self->fz = NULL;
324 return self;
325 }
326
327 static PyObject *
PyTrePattern_search(TrePatternObject * self,PyObject * args)328 PyTrePattern_search(TrePatternObject *self, PyObject *args)
329 {
330 PyObject *pstring;
331 int eflags = 0;
332 TreMatchObject *mo;
333 TreFuzzynessObject *fz;
334 size_t nsub;
335 int rc;
336 regmatch_t *pm;
337 char *targ;
338 size_t tlen;
339
340 if (PyTuple_Size(args) > 0 && PyUnicode_Check(PyTuple_GetItem(args, 0)))
341 {
342 if (!PyArg_ParseTuple(args, "UO!|i:search", &pstring, &TreFuzzynessType,
343 &fz, &eflags))
344 return NULL;
345 }
346 else
347 {
348 if (!PyArg_ParseTuple(args, "SO!|i:search", &pstring, &TreFuzzynessType,
349 &fz, &eflags))
350 return NULL;
351 }
352
353 mo = newTreMatchObject();
354 if (mo == NULL)
355 return NULL;
356
357 nsub = self->rgx.re_nsub + 1;
358 pm = PyMem_New(regmatch_t, nsub);
359 if (!pm)
360 {
361 Py_DECREF(mo);
362 return PyErr_NoMemory();
363 }
364
365 mo->am.nmatch = nsub;
366 mo->am.pmatch = pm;
367
368 if (PyUnicode_Check(pstring))
369 {
370 Py_ssize_t len = PyUnicode_GetSize(pstring);
371 wchar_t *buf = calloc(sizeof(wchar_t), len);
372 if(!buf)
373 {
374 Py_DECREF(mo);
375 return PyErr_NoMemory();
376 }
377 PyUnicode_AsWideChar(pstring, buf, len);
378 rc = tre_regawnexec(&self->rgx, buf, len, &mo->am, fz->ap, eflags);
379 free(buf);
380 }
381 else
382 {
383 targ = PyString_AsString(pstring);
384 tlen = PyString_Size(pstring);
385
386 rc = tre_reganexec(&self->rgx, targ, tlen, &mo->am, fz->ap, eflags);
387 }
388
389 if (PyErr_Occurred())
390 {
391 Py_DECREF(mo);
392 return NULL;
393 }
394
395 if (rc == REG_OK)
396 {
397 Py_INCREF(pstring);
398 mo->targ = pstring;
399 Py_INCREF(fz);
400 mo->fz = fz;
401 return (PyObject*)mo;
402 }
403
404 if (rc == REG_NOMATCH)
405 {
406 Py_DECREF(mo);
407 Py_INCREF(Py_None);
408 return Py_None;
409 }
410 _set_tre_err(rc, &self->rgx);
411 Py_DECREF(mo);
412 return NULL;
413 }
414
415 static PyMethodDef TrePattern_methods[] = {
416 { "search", (PyCFunction)PyTrePattern_search, METH_VARARGS,
417 "try to search in the given string, returning " TRE_MODULE ".match object "
418 "or None on failure" },
419 {NULL, NULL}
420 };
421
422 static PyMemberDef TrePattern_members[] = {
423 { "nsub", T_INT, offsetof(TrePatternObject, rgx.re_nsub), READONLY,
424 "Number of parenthesized subexpressions in regex" },
425 { NULL }
426 };
427
428 static void
PyTrePattern_dealloc(TrePatternObject * self)429 PyTrePattern_dealloc(TrePatternObject *self)
430 {
431 tre_regfree(&self->rgx);
432 PyObject_Del(self);
433 }
434
435 static PyTypeObject TrePatternType = {
436 PyObject_HEAD_INIT(NULL)
437 0, /* ob_size */
438 TRE_MODULE ".Pattern", /* tp_name */
439 sizeof(TrePatternObject), /* tp_basicsize */
440 0, /* tp_itemsize */
441 /* methods */
442 (destructor)PyTrePattern_dealloc, /*tp_dealloc*/
443 0, /* tp_print */
444 0, /* tp_getattr */
445 0, /* tp_setattr */
446 0, /* tp_compare */
447 0, /* tp_repr */
448 0, /* tp_as_number */
449 0, /* tp_as_sequence */
450 0, /* tp_as_mapping */
451 0, /* tp_hash */
452 0, /* tp_call */
453 0, /* tp_str */
454 0, /* tp_getattro */
455 0, /* tp_setattro */
456 0, /* tp_as_buffer */
457 Py_TPFLAGS_DEFAULT, /* tp_flags */
458 TRE_MODULE ".pattern object holds compiled tre regex", /* tp_doc */
459 0, /* tp_traverse */
460 0, /* tp_clear */
461 0, /* tp_richcompare */
462 0, /* tp_weaklistoffset */
463 0, /* tp_iter */
464 0, /* tp_iternext */
465 TrePattern_methods, /* tp_methods */
466 TrePattern_members /* tp_members */
467 };
468
469 static TrePatternObject *
newTrePatternObject()470 newTrePatternObject()
471 {
472 TrePatternObject *self;
473
474 self = PyObject_New(TrePatternObject, &TrePatternType);
475 if (self == NULL)
476 return NULL;
477 self->flags = 0;
478 return self;
479 }
480
481 static PyObject *
PyTre_ncompile(PyObject * self,PyObject * args)482 PyTre_ncompile(PyObject *self, PyObject *args)
483 {
484 TrePatternObject *rv;
485 PyUnicodeObject *upattern = NULL;
486 char *pattern = NULL;
487 int pattlen;
488 int cflags = 0;
489 int rc;
490
491 if (PyTuple_Size(args) > 0 && PyUnicode_Check(PyTuple_GetItem(args, 0)))
492 {
493 if (!PyArg_ParseTuple(args, "U|i:compile", &upattern, &cflags))
494 return NULL;
495 }
496 else
497 {
498 if (!PyArg_ParseTuple(args, "s#|i:compile", &pattern, &pattlen, &cflags))
499 return NULL;
500 }
501
502 rv = newTrePatternObject();
503 if (rv == NULL)
504 return NULL;
505
506 if (upattern != NULL)
507 {
508 Py_ssize_t len = PyUnicode_GetSize(upattern);
509 wchar_t *buf = calloc(sizeof(wchar_t), len);
510 if(!buf)
511 {
512 Py_DECREF(rv);
513 return PyErr_NoMemory();
514 }
515 PyUnicode_AsWideChar(upattern, buf, len);
516 rc = tre_regwncomp(&rv->rgx, buf, len, cflags);
517 free(buf);
518 }
519 else
520 rc = tre_regncomp(&rv->rgx, (char*)pattern, pattlen, cflags);
521
522 if (rc != REG_OK)
523 {
524 if (!PyErr_Occurred())
525 _set_tre_err(rc, &rv->rgx);
526 Py_DECREF(rv);
527 return NULL;
528 }
529 rv->flags = cflags;
530 return (PyObject*)rv;
531 }
532
533 static PyMethodDef tre_methods[] = {
534 { "compile", PyTre_ncompile, METH_VARARGS,
535 "Compile a regular expression pattern, returning a "
536 TRE_MODULE ".pattern object" },
537 { NULL, NULL }
538 };
539
/* Module doc string, passed to Py_InitModule3() in inittre(). */
static char *tre_doc =
  "Python module for TRE library\n"
  "\n"
  "Module exports the only function: compile";
543
544 static struct _tre_flags {
545 char *name;
546 int val;
547 } tre_flags[] = {
548 { "EXTENDED", REG_EXTENDED },
549 { "ICASE", REG_ICASE },
550 { "NEWLINE", REG_NEWLINE },
551 { "NOSUB", REG_NOSUB },
552 { "LITERAL", REG_LITERAL },
553
554 { "NOTBOL", REG_NOTBOL },
555 { "NOTEOL", REG_NOTEOL },
556 { NULL, 0 }
557 };
558
559 PyMODINIT_FUNC
inittre(void)560 inittre(void)
561 {
562 PyObject *m;
563 struct _tre_flags *fp;
564
565 if (PyType_Ready(&TreFuzzynessType) < 0)
566 return;
567 if (PyType_Ready(&TreMatchType) < 0)
568 return;
569 if (PyType_Ready(&TrePatternType) < 0)
570 return;
571
572 /* Create the module and add the functions */
573 m = Py_InitModule3(TRE_MODULE, tre_methods, tre_doc);
574 if (m == NULL)
575 return;
576
577 Py_INCREF(&TreFuzzynessType);
578 if (PyModule_AddObject(m, "Fuzzyness", (PyObject*)&TreFuzzynessType) < 0)
579 return;
580 Py_INCREF(&TreMatchType);
581 if (PyModule_AddObject(m, "Match", (PyObject*)&TreMatchType) < 0)
582 return;
583 Py_INCREF(&TrePatternType);
584 if (PyModule_AddObject(m, "Pattern", (PyObject*)&TrePatternType) < 0)
585 return;
586 ErrorObject = PyErr_NewException(TRE_MODULE ".Error", NULL, NULL);
587 Py_INCREF(ErrorObject);
588 if (PyModule_AddObject(m, "Error", ErrorObject) < 0)
589 return;
590
591 /* Insert the flags */
592 for (fp = tre_flags; fp->name != NULL; fp++)
593 if (PyModule_AddIntConstant(m, fp->name, fp->val) < 0)
594 return;
595 }
596