1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * (C) Copyright 2015 Hewlett Packard Enterprise Development LP<BR>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include "regparse.h"
33 #include "st.h"
34 
35 #define WARN_BUFSIZE    256
36 
37 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
38 
39 
40 OnigSyntaxType OnigSyntaxRuby = {
41   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
42      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
43      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
44      ONIG_SYN_OP_ESC_C_CONTROL )
45    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
46   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
47       ONIG_SYN_OP2_OPTION_RUBY |
48       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
49       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
50       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
51       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
52       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
53       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
54       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
55       ONIG_SYN_OP2_ESC_H_XDIGIT )
56   , ( SYN_GNU_REGEX_BV |
57       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
58       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
59       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
60       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
61       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
62       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
63       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
64   , ONIG_OPTION_NONE
65   ,
66   {
67       (OnigCodePoint )'\\'                       /* esc */
68     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
69     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
70     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
71     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
72     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
73   }
74 };
75 
76 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
77 
onig_null_warn(const char * s ARG_UNUSED)78 extern void onig_null_warn(const char* s ARG_UNUSED) { }
79 
80 #ifdef DEFAULT_WARN_FUNCTION
81 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
82 #else
83 static OnigWarnFunc onig_warn = onig_null_warn;
84 #endif
85 
86 #ifdef DEFAULT_VERB_WARN_FUNCTION
87 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
88 #else
89 static OnigWarnFunc onig_verb_warn = onig_null_warn;
90 #endif
91 
onig_set_warn_func(OnigWarnFunc f)92 extern void onig_set_warn_func(OnigWarnFunc f)
93 {
94   onig_warn = f;
95 }
96 
onig_set_verb_warn_func(OnigWarnFunc f)97 extern void onig_set_verb_warn_func(OnigWarnFunc f)
98 {
99   onig_verb_warn = f;
100 }
101 
102 static void
bbuf_free(BBuf * bbuf)103 bbuf_free(BBuf* bbuf)
104 {
105   if (IS_NOT_NULL(bbuf)) {
106     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
107     xfree(bbuf);
108   }
109 }
110 
111 static int
bbuf_clone(BBuf ** rto,BBuf * from)112 bbuf_clone(BBuf** rto, BBuf* from)
113 {
114   int r;
115   BBuf *to;
116 
117   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
118   CHECK_NULL_RETURN_MEMERR(to);
119   r = BBUF_INIT(to, from->alloc);
120   if (r != 0) return r;
121   to->used = from->used;
122   xmemcpy(to->p, from->p, from->used);
123   return 0;
124 }
125 
126 #define BACKREF_REL_TO_ABS(rel_no, env) \
127   ((env)->num_mem + 1 + (rel_no))
128 
129 #define ONOFF(v,f,negative)    (negative) ? ((v) &= ~(f)) : ((v) |= (f))
130 
131 #define MBCODE_START_POS(enc) \
132   (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
133 
134 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
135   add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
136 
137 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
138   if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
139     r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
140     if (r) return r;\
141   }\
142 } while (0)
143 
144 
145 #define BITSET_IS_EMPTY(bs,empty) do {\
146   int i;\
147   empty = 1;\
148   for (i = 0; i < (int )BITSET_SIZE; i++) {\
149     if ((bs)[i] != 0) {\
150       empty = 0; break;\
151     }\
152   }\
153 } while (0)
154 
155 static void
bitset_set_range(BitSetRef bs,int from,int to)156 bitset_set_range(BitSetRef bs, int from, int to)
157 {
158   int i;
159   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
160     BITSET_SET_BIT(bs, i);
161   }
162 }
163 
164 #if 0
165 static void
166 bitset_set_all(BitSetRef bs)
167 {
168   int i;
169   for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
170 }
171 #endif
172 
173 static void
bitset_invert(BitSetRef bs)174 bitset_invert(BitSetRef bs)
175 {
176   int i;
177   for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
178 }
179 
180 static void
bitset_invert_to(BitSetRef from,BitSetRef to)181 bitset_invert_to(BitSetRef from, BitSetRef to)
182 {
183   int i;
184   for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
185 }
186 
187 static void
bitset_and(BitSetRef dest,BitSetRef bs)188 bitset_and(BitSetRef dest, BitSetRef bs)
189 {
190   int i;
191   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
192 }
193 
194 static void
bitset_or(BitSetRef dest,BitSetRef bs)195 bitset_or(BitSetRef dest, BitSetRef bs)
196 {
197   int i;
198   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
199 }
200 
201 static void
bitset_copy(BitSetRef dest,BitSetRef bs)202 bitset_copy(BitSetRef dest, BitSetRef bs)
203 {
204   int i;
205   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
206 }
207 
208 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)209 onig_strncmp(const UChar* s1, const UChar* s2, int n)
210 {
211   int x;
212 
213   while (n-- > 0) {
214     x = *s2++ - *s1++;
215     if (x) return x;
216   }
217   return 0;
218 }
219 
220 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)221 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
222 {
223   int len = (int)(end - src);
224   if (len > 0) {
225     xmemcpy(dest, src, len);
226     dest[len] = (UChar )0;
227   }
228 }
229 
230 #ifdef USE_NAMED_GROUP
231 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)232 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
233 {
234   int slen, term_len, i;
235   UChar *r;
236 
237   slen = (int)(end - s);
238   term_len = ONIGENC_MBC_MINLEN(enc);
239 
240   r = (UChar* )xmalloc(slen + term_len);
241   CHECK_NULL_RETURN(r);
242   xmemcpy(r, s, slen);
243 
244   for (i = 0; i < term_len; i++)
245     r[slen + i] = (UChar )0;
246 
247   return r;
248 }
249 #endif
250 
251 /* scan pattern methods */
252 #define PEND_VALUE   0
253 
254 #define PFETCH_READY  UChar* pfetch_prev
255 #define PEND         (p < end ?  0 : 1)
256 #define PUNFETCH     p = pfetch_prev
257 #define PINC       do { \
258   pfetch_prev = p; \
259   p += ONIGENC_MBC_ENC_LEN(enc, p); \
260 } while (0)
261 #define PFETCH(c)  do { \
262   c = ONIGENC_MBC_TO_CODE(enc, p, end); \
263   pfetch_prev = p; \
264   p += ONIGENC_MBC_ENC_LEN(enc, p); \
265 } while (0)
266 
267 #define PINC_S     do { \
268   p += ONIGENC_MBC_ENC_LEN(enc, p); \
269 } while (0)
270 #define PFETCH_S(c) do { \
271   c = ONIGENC_MBC_TO_CODE(enc, p, end); \
272   p += ONIGENC_MBC_ENC_LEN(enc, p); \
273 } while (0)
274 
275 #define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
276 #define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
277 
278 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa,int oldCapa)279 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
280 	      int capa, int oldCapa)
281 {
282   UChar* r;
283 
284   if (dest)
285     r = (UChar* )xrealloc(dest, capa + 1, oldCapa);
286   else
287     r = (UChar* )xmalloc(capa + 1);
288 
289   CHECK_NULL_RETURN(r);
290   onig_strcpy(r + (dest_end - dest), src, src_end);
291   return r;
292 }
293 
294 /* dest on static area */
295 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)296 strcat_capa_from_static(UChar* dest, UChar* dest_end,
297 			const UChar* src, const UChar* src_end, int capa)
298 {
299   UChar* r;
300 
301   r = (UChar* )xmalloc(capa + 1);
302   CHECK_NULL_RETURN(r);
303   onig_strcpy(r, dest, dest_end);
304   onig_strcpy(r + (dest_end - dest), src, src_end);
305   return r;
306 }
307 
308 
309 #ifdef USE_ST_LIBRARY
310 
311 typedef struct {
312   UChar* s;
313   UChar* end;
314 } st_str_end_key;
315 
316 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)317 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
318 {
319   UChar *p, *q;
320   int c;
321 
322   if ((x->end - x->s) != (y->end - y->s))
323     return 1;
324 
325   p = x->s;
326   q = y->s;
327   while (p < x->end) {
328     c = (int )*p - (int )*q;
329     if (c != 0) return c;
330 
331     p++; q++;
332   }
333 
334   return 0;
335 }
336 
337 static int
str_end_hash(st_str_end_key * x)338 str_end_hash(st_str_end_key* x)
339 {
340   UChar *p;
341   int val = 0;
342 
343   p = x->s;
344   while (p < x->end) {
345     val = val * 997 + (int )*p++;
346   }
347 
348   return val + (val >> 5);
349 }
350 
351 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)352 onig_st_init_strend_table_with_size(int size)
353 {
354   static struct st_hash_type hashType = {
355     str_end_cmp,
356     str_end_hash,
357   };
358 
359   return (hash_table_type* )
360            onig_st_init_table_with_size(&hashType, size);
361 }
362 
363 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)364 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
365 		      const UChar* end_key, hash_data_type *value)
366 {
367   st_str_end_key key;
368 
369   key.s   = (UChar* )str_key;
370   key.end = (UChar* )end_key;
371 
372   return onig_st_lookup(table, (st_data_t )(UINTN)(&key), value);
373 }
374 
375 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)376 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
377 		      const UChar* end_key, hash_data_type value)
378 {
379   st_str_end_key* key;
380   int result;
381 
382   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
383   CHECK_NULL_RETURN_MEMERR(key);
384   key->s   = (UChar* )str_key;
385   key->end = (UChar* )end_key;
386   result = onig_st_insert(table, (st_data_t )(UINTN)key, value);
387   if (result) {
388     xfree(key);
389   }
390   return result;
391 }
392 
393 #endif /* USE_ST_LIBRARY */
394 
395 
396 #ifdef USE_NAMED_GROUP
397 
398 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
399 
400 typedef struct {
401   UChar* name;
402   int    name_len;   /* byte length */
403   int    back_num;   /* number of backrefs */
404   int    back_alloc;
405   int    back_ref1;
406   int*   back_refs;
407 } NameEntry;
408 
409 #ifdef USE_ST_LIBRARY
410 
411 typedef st_table  NameTable;
412 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
413 
414 #define NAMEBUF_SIZE    24
415 #define NAMEBUF_SIZE_1  25
416 
417 #ifdef ONIG_DEBUG
418 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)419 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
420 {
421   int i;
422   FILE* fp = (FILE* )arg;
423 
424   fprintf(fp, "%s: ", e->name);
425   if (e->back_num == 0)
426     fputs("-", fp);
427   else if (e->back_num == 1)
428     fprintf(fp, "%d", e->back_ref1);
429   else {
430     for (i = 0; i < e->back_num; i++) {
431       if (i > 0) fprintf(fp, ", ");
432       fprintf(fp, "%d", e->back_refs[i]);
433     }
434   }
435   fputs("\n", fp);
436   return ST_CONTINUE;
437 }
438 
439 extern int
onig_print_names(FILE * fp,regex_t * reg)440 onig_print_names(FILE* fp, regex_t* reg)
441 {
442   NameTable* t = (NameTable* )reg->name_table;
443 
444   if (IS_NOT_NULL(t)) {
445     fprintf(fp, "name table\n");
446     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
447     fputs("\n", fp);
448   }
449   return 0;
450 }
451 #endif /* ONIG_DEBUG */
452 
453 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)454 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
455 {
456   xfree(e->name);
457   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
458   xfree(key);
459   xfree(e);
460   return ST_DELETE;
461 }
462 
463 static int
names_clear(regex_t * reg)464 names_clear(regex_t* reg)
465 {
466   NameTable* t = (NameTable* )reg->name_table;
467 
468   if (IS_NOT_NULL(t)) {
469     onig_st_foreach(t, i_free_name_entry, 0);
470   }
471   return 0;
472 }
473 
474 extern int
onig_names_free(regex_t * reg)475 onig_names_free(regex_t* reg)
476 {
477   int r;
478   NameTable* t;
479 
480   r = names_clear(reg);
481   if (r) return r;
482 
483   t = (NameTable* )reg->name_table;
484   if (IS_NOT_NULL(t)) onig_st_free_table(t);
485   reg->name_table = (void* )NULL;
486   return 0;
487 }
488 
489 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)490 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
491 {
492   NameEntry* e;
493   NameTable* t = (NameTable* )reg->name_table;
494 
495   e = (NameEntry* )NULL;
496   if (IS_NOT_NULL(t)) {
497     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
498   }
499   return e;
500 }
501 
502 typedef struct {
503   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
504   regex_t* reg;
505   void* arg;
506   int ret;
507   OnigEncoding enc;
508 } INamesArg;
509 
510 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)511 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
512 {
513   int r = (*(arg->func))(e->name,
514                          e->name + e->name_len,
515                          e->back_num,
516 			 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
517 			 arg->reg, arg->arg);
518   if (r != 0) {
519     arg->ret = r;
520     return ST_STOP;
521   }
522   return ST_CONTINUE;
523 }
524 
525 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)526 onig_foreach_name(regex_t* reg,
527   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
528 {
529   INamesArg narg;
530   NameTable* t = (NameTable* )reg->name_table;
531 
532   narg.ret = 0;
533   if (IS_NOT_NULL(t)) {
534     narg.func = func;
535     narg.reg  = reg;
536     narg.arg  = arg;
537     narg.enc  = reg->enc; /* should be pattern encoding. */
538     onig_st_foreach(t, i_names, (HashDataType )(UINTN)&narg);
539   }
540   return narg.ret;
541 }
542 
543 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)544 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
545 {
546   int i;
547 
548   if (e->back_num > 1) {
549     for (i = 0; i < e->back_num; i++) {
550       e->back_refs[i] = map[e->back_refs[i]].new_val;
551     }
552   }
553   else if (e->back_num == 1) {
554     e->back_ref1 = map[e->back_ref1].new_val;
555   }
556 
557   return ST_CONTINUE;
558 }
559 
560 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)561 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
562 {
563   NameTable* t = (NameTable* )reg->name_table;
564 
565   if (IS_NOT_NULL(t)) {
566     onig_st_foreach(t, i_renumber_name, (HashDataType )(UINTN)map);
567   }
568   return 0;
569 }
570 
571 
572 extern int
onig_number_of_names(regex_t * reg)573 onig_number_of_names(regex_t* reg)
574 {
575   NameTable* t = (NameTable* )reg->name_table;
576 
577   if (IS_NOT_NULL(t))
578     return t->num_entries;
579   else
580     return 0;
581 }
582 
583 #else  /* USE_ST_LIBRARY */
584 
585 #define INIT_NAMES_ALLOC_NUM    8
586 
587 typedef struct {
588   NameEntry* e;
589   int        num;
590   int        alloc;
591 } NameTable;
592 
593 #ifdef ONIG_DEBUG
594 extern int
onig_print_names(FILE * fp,regex_t * reg)595 onig_print_names(FILE* fp, regex_t* reg)
596 {
597   int i, j;
598   NameEntry* e;
599   NameTable* t = (NameTable* )reg->name_table;
600 
601   if (IS_NOT_NULL(t) && t->num > 0) {
602     fprintf(fp, "name table\n");
603     for (i = 0; i < t->num; i++) {
604       e = &(t->e[i]);
605       fprintf(fp, "%s: ", e->name);
606       if (e->back_num == 0) {
607 	fputs("-", fp);
608       }
609       else if (e->back_num == 1) {
610 	fprintf(fp, "%d", e->back_ref1);
611       }
612       else {
613 	for (j = 0; j < e->back_num; j++) {
614 	  if (j > 0) fprintf(fp, ", ");
615 	  fprintf(fp, "%d", e->back_refs[j]);
616 	}
617       }
618       fputs("\n", fp);
619     }
620     fputs("\n", fp);
621   }
622   return 0;
623 }
624 #endif
625 
626 static int
names_clear(regex_t * reg)627 names_clear(regex_t* reg)
628 {
629   int i;
630   NameEntry* e;
631   NameTable* t = (NameTable* )reg->name_table;
632 
633   if (IS_NOT_NULL(t)) {
634     for (i = 0; i < t->num; i++) {
635       e = &(t->e[i]);
636       if (IS_NOT_NULL(e->name)) {
637 	xfree(e->name);
638 	e->name       = NULL;
639 	e->name_len   = 0;
640 	e->back_num   = 0;
641 	e->back_alloc = 0;
642 	if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
643 	e->back_refs = (int* )NULL;
644       }
645     }
646     if (IS_NOT_NULL(t->e)) {
647       xfree(t->e);
648       t->e = NULL;
649     }
650     t->num = 0;
651   }
652   return 0;
653 }
654 
655 extern int
onig_names_free(regex_t * reg)656 onig_names_free(regex_t* reg)
657 {
658   int r;
659   NameTable* t;
660 
661   r = names_clear(reg);
662   if (r) return r;
663 
664   t = (NameTable* )reg->name_table;
665   if (IS_NOT_NULL(t)) xfree(t);
666   reg->name_table = NULL;
667   return 0;
668 }
669 
670 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)671 name_find(regex_t* reg, UChar* name, UChar* name_end)
672 {
673   int i, len;
674   NameEntry* e;
675   NameTable* t = (NameTable* )reg->name_table;
676 
677   if (IS_NOT_NULL(t)) {
678     len = name_end - name;
679     for (i = 0; i < t->num; i++) {
680       e = &(t->e[i]);
681       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
682 	return e;
683     }
684   }
685   return (NameEntry* )NULL;
686 }
687 
688 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)689 onig_foreach_name(regex_t* reg,
690   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
691 {
692   int i, r;
693   NameEntry* e;
694   NameTable* t = (NameTable* )reg->name_table;
695 
696   if (IS_NOT_NULL(t)) {
697     for (i = 0; i < t->num; i++) {
698       e = &(t->e[i]);
699       r = (*func)(e->name, e->name + e->name_len, e->back_num,
700 		  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
701 		  reg, arg);
702       if (r != 0) return r;
703     }
704   }
705   return 0;
706 }
707 
708 extern int
onig_number_of_names(regex_t * reg)709 onig_number_of_names(regex_t* reg)
710 {
711   NameTable* t = (NameTable* )reg->name_table;
712 
713   if (IS_NOT_NULL(t))
714     return t->num;
715   else
716     return 0;
717 }
718 
719 #endif /* else USE_ST_LIBRARY */
720 
721 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)722 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
723 {
724   int alloc;
725   NameEntry* e;
726   NameTable* t = (NameTable* )reg->name_table;
727 
728   if (name_end - name <= 0)
729     return ONIGERR_EMPTY_GROUP_NAME;
730 
731   e = name_find(reg, name, name_end);
732   if (IS_NULL(e)) {
733 #ifdef USE_ST_LIBRARY
734     if (IS_NULL(t)) {
735       t = onig_st_init_strend_table_with_size(5);
736       CHECK_NULL_RETURN_MEMERR(t);
737       reg->name_table = (void* )t;
738     }
739     e = (NameEntry* )xmalloc(sizeof(NameEntry));
740     CHECK_NULL_RETURN_MEMERR(e);
741 
742     e->name = strdup_with_null(reg->enc, name, name_end);
743     if (IS_NULL(e->name)) {
744       xfree(e);  return ONIGERR_MEMORY;
745     }
746     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
747                           (HashDataType )(UINTN)e);
748 
749     e->name_len   = (int)(name_end - name);
750     e->back_num   = 0;
751     e->back_alloc = 0;
752     e->back_refs  = (int* )NULL;
753 
754 #else
755 
756     if (IS_NULL(t)) {
757       alloc = INIT_NAMES_ALLOC_NUM;
758       t = (NameTable* )xmalloc(sizeof(NameTable));
759       CHECK_NULL_RETURN_MEMERR(t);
760       t->e     = NULL;
761       t->alloc = 0;
762       t->num   = 0;
763 
764       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
765       if (IS_NULL(t->e)) {
766 	xfree(t);
767 	return ONIGERR_MEMORY;
768       }
769       t->alloc = alloc;
770       reg->name_table = t;
771       goto clear;
772     }
773     else if (t->num == t->alloc) {
774       int i;
775 
776       alloc = t->alloc * 2;
777       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
778       CHECK_NULL_RETURN_MEMERR(t->e);
779       t->alloc = alloc;
780 
781     clear:
782       for (i = t->num; i < t->alloc; i++) {
783 	t->e[i].name       = NULL;
784 	t->e[i].name_len   = 0;
785 	t->e[i].back_num   = 0;
786 	t->e[i].back_alloc = 0;
787 	t->e[i].back_refs  = (int* )NULL;
788       }
789     }
790     e = &(t->e[t->num]);
791     t->num++;
792     e->name = strdup_with_null(reg->enc, name, name_end);
793     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
794     e->name_len = name_end - name;
795 #endif
796   }
797 
798   if (e->back_num >= 1 &&
799       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
800     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
801 				    name, name_end);
802     return ONIGERR_MULTIPLEX_DEFINED_NAME;
803   }
804 
805   e->back_num++;
806   if (e->back_num == 1) {
807     e->back_ref1 = backref;
808   }
809   else {
810     if (e->back_num == 2) {
811       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
812       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
813       CHECK_NULL_RETURN_MEMERR(e->back_refs);
814       e->back_alloc = alloc;
815       e->back_refs[0] = e->back_ref1;
816       e->back_refs[1] = backref;
817     }
818     else {
819       if (e->back_num > e->back_alloc) {
820 	alloc = e->back_alloc * 2;
821 	e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc, sizeof(int) * e->back_alloc);
822 	CHECK_NULL_RETURN_MEMERR(e->back_refs);
823 	e->back_alloc = alloc;
824       }
825       e->back_refs[e->back_num - 1] = backref;
826     }
827   }
828 
829   return 0;
830 }
831 
832 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)833 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
834 			   const UChar* name_end, int** nums)
835 {
836   NameEntry* e = name_find(reg, name, name_end);
837 
838   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
839 
840   switch (e->back_num) {
841   case 0:
842     break;
843   case 1:
844     *nums = &(e->back_ref1);
845     break;
846   default:
847     *nums = e->back_refs;
848     break;
849   }
850   return e->back_num;
851 }
852 
853 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)854 onig_name_to_backref_number(regex_t* reg, const UChar* name,
855 			    const UChar* name_end, OnigRegion *region)
856 {
857   int i, n, *nums;
858 
859   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
860   if (n < 0)
861     return n;
862   else if (n == 0)
863     return ONIGERR_PARSER_BUG;
864   else if (n == 1)
865     return nums[0];
866   else {
867     if (IS_NOT_NULL(region)) {
868       for (i = n - 1; i >= 0; i--) {
869 	if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
870 	  return nums[i];
871       }
872     }
873     return nums[n - 1];
874   }
875 }
876 
877 #else /* USE_NAMED_GROUP */
878 
879 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)880 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
881 			   const UChar* name_end, int** nums)
882 {
883   return ONIG_NO_SUPPORT_CONFIG;
884 }
885 
886 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)887 onig_name_to_backref_number(regex_t* reg, const UChar* name,
888 			    const UChar* name_end, OnigRegion* region)
889 {
890   return ONIG_NO_SUPPORT_CONFIG;
891 }
892 
893 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)894 onig_foreach_name(regex_t* reg,
895   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
896 {
897   return ONIG_NO_SUPPORT_CONFIG;
898 }
899 
900 extern int
onig_number_of_names(regex_t * reg)901 onig_number_of_names(regex_t* reg)
902 {
903   return 0;
904 }
905 #endif /* else USE_NAMED_GROUP */
906 
907 extern int
onig_noname_group_capture_is_active(regex_t * reg)908 onig_noname_group_capture_is_active(regex_t* reg)
909 {
910   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
911     return 0;
912 
913 #ifdef USE_NAMED_GROUP
914   if (onig_number_of_names(reg) > 0 &&
915       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
916       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
917     return 0;
918   }
919 #endif
920 
921   return 1;
922 }
923 
924 
925 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
926 
927 static void
scan_env_clear(ScanEnv * env)928 scan_env_clear(ScanEnv* env)
929 {
930   int i;
931 
932   BIT_STATUS_CLEAR(env->capture_history);
933   BIT_STATUS_CLEAR(env->bt_mem_start);
934   BIT_STATUS_CLEAR(env->bt_mem_end);
935   BIT_STATUS_CLEAR(env->backrefed_mem);
936   env->error      = (UChar* )NULL;
937   env->error_end  = (UChar* )NULL;
938   env->num_call   = 0;
939   env->num_mem    = 0;
940 #ifdef USE_NAMED_GROUP
941   env->num_named  = 0;
942 #endif
943   env->mem_alloc         = 0;
944   env->mem_nodes_dynamic = (Node** )NULL;
945 
946   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
947     env->mem_nodes_static[i] = NULL_NODE;
948 
949 #ifdef USE_COMBINATION_EXPLOSION_CHECK
950   env->num_comb_exp_check  = 0;
951   env->comb_exp_max_regnum = 0;
952   env->curr_max_regnum     = 0;
953   env->has_recursion       = 0;
954 #endif
955 }
956 
957 static int
scan_env_add_mem_entry(ScanEnv * env)958 scan_env_add_mem_entry(ScanEnv* env)
959 {
960   int i, need, alloc;
961   Node** p;
962 
963   need = env->num_mem + 1;
964   if (need >= SCANENV_MEMNODES_SIZE) {
965     if (env->mem_alloc <= need) {
966       if (IS_NULL(env->mem_nodes_dynamic)) {
967 	alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
968 	p = (Node** )xmalloc(sizeof(Node*) * alloc);
969     CHECK_NULL_RETURN_MEMERR(p);
970 
971 	xmemcpy(p, env->mem_nodes_static,
972 		sizeof(Node*) * SCANENV_MEMNODES_SIZE);
973       }
974       else {
975 	alloc = env->mem_alloc * 2;
976 	p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc, sizeof(Node*) * env->mem_alloc);
977       }
978       CHECK_NULL_RETURN_MEMERR(p);
979 
980       for (i = env->num_mem + 1; i < alloc; i++)
981 	p[i] = NULL_NODE;
982 
983       env->mem_nodes_dynamic = p;
984       env->mem_alloc = alloc;
985     }
986   }
987 
988   env->num_mem++;
989   return env->num_mem;
990 }
991 
992 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)993 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
994 {
995   if (env->num_mem >= num)
996     SCANENV_MEM_NODES(env)[num] = node;
997   else
998     return ONIGERR_PARSER_BUG;
999   return 0;
1000 }
1001 
1002 
1003 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1004 typedef struct _FreeNode {
1005   struct _FreeNode* next;
1006 } FreeNode;
1007 
1008 static FreeNode* FreeNodeList = (FreeNode* )NULL;
1009 #endif
1010 
1011 extern void
onig_node_free(Node * node)1012 onig_node_free(Node* node)
1013 {
1014  start:
1015   if (IS_NULL(node)) return ;
1016 
1017   switch (NTYPE(node)) {
1018   case NT_STR:
1019     if (NSTR(node)->capa != 0 &&
1020 	IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1021       xfree(NSTR(node)->s);
1022     }
1023     break;
1024 
1025   case NT_LIST:
1026   case NT_ALT:
1027     onig_node_free(NCAR(node));
1028     {
1029       Node* next_node = NCDR(node);
1030 
1031 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1032       {
1033 	FreeNode* n = (FreeNode* )node;
1034 
1035         THREAD_ATOMIC_START;
1036 	n->next = FreeNodeList;
1037 	FreeNodeList = n;
1038         THREAD_ATOMIC_END;
1039       }
1040 #else
1041       xfree(node);
1042 #endif
1043       node = next_node;
1044       goto start;
1045     }
1046     break;
1047 
1048   case NT_CCLASS:
1049     {
1050       CClassNode* cc = NCCLASS(node);
1051 
1052       if (IS_NCCLASS_SHARE(cc)) return ;
1053       if (cc->mbuf)
1054         bbuf_free(cc->mbuf);
1055     }
1056     break;
1057 
1058   case NT_QTFR:
1059     if (NQTFR(node)->target)
1060       onig_node_free(NQTFR(node)->target);
1061     break;
1062 
1063   case NT_ENCLOSE:
1064     if (NENCLOSE(node)->target)
1065       onig_node_free(NENCLOSE(node)->target);
1066     break;
1067 
1068   case NT_BREF:
1069     if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1070       xfree(NBREF(node)->back_dynamic);
1071     break;
1072 
1073   case NT_ANCHOR:
1074     if (NANCHOR(node)->target)
1075       onig_node_free(NANCHOR(node)->target);
1076     break;
1077   }
1078 
1079 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1080   {
1081     FreeNode* n = (FreeNode* )node;
1082 
1083     THREAD_ATOMIC_START;
1084     n->next = FreeNodeList;
1085     FreeNodeList = n;
1086     THREAD_ATOMIC_END;
1087   }
1088 #else
1089   xfree(node);
1090 #endif
1091 }
1092 
1093 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1094 extern int
onig_free_node_list(void)1095 onig_free_node_list(void)
1096 {
1097   FreeNode* n;
1098 
1099   /* THREAD_ATOMIC_START; */
1100   while (IS_NOT_NULL(FreeNodeList)) {
1101     n = FreeNodeList;
1102     FreeNodeList = FreeNodeList->next;
1103     xfree(n);
1104   }
1105   /* THREAD_ATOMIC_END; */
1106   return 0;
1107 }
1108 #endif
1109 
1110 static Node*
node_new(void)1111 node_new(void)
1112 {
1113   Node* node;
1114 
1115 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1116   THREAD_ATOMIC_START;
1117   if (IS_NOT_NULL(FreeNodeList)) {
1118     node = (Node* )FreeNodeList;
1119     FreeNodeList = FreeNodeList->next;
1120     THREAD_ATOMIC_END;
1121     return node;
1122   }
1123   THREAD_ATOMIC_END;
1124 #endif
1125 
1126   node = (Node* )xmalloc(sizeof(Node));
1127   /* xmemset(node, 0, sizeof(Node)); */
1128   return node;
1129 }
1130 
1131 
1132 static void
initialize_cclass(CClassNode * cc)1133 initialize_cclass(CClassNode* cc)
1134 {
1135   BITSET_CLEAR(cc->bs);
1136   /* cc->base.flags = 0; */
1137   cc->flags = 0;
1138   cc->mbuf  = NULL;
1139 }
1140 
1141 static Node*
node_new_cclass(void)1142 node_new_cclass(void)
1143 {
1144   Node* node = node_new();
1145   CHECK_NULL_RETURN(node);
1146 
1147   SET_NTYPE(node, NT_CCLASS);
1148   initialize_cclass(NCCLASS(node));
1149   return node;
1150 }
1151 
1152 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1153 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1154 				   const OnigCodePoint ranges[])
1155 {
1156   int n, i;
1157   CClassNode* cc;
1158   OnigCodePoint j;
1159 
1160   Node* node = node_new_cclass();
1161   CHECK_NULL_RETURN(node);
1162 
1163   cc = NCCLASS(node);
1164   if (not != 0) NCCLASS_SET_NOT(cc);
1165 
1166   BITSET_CLEAR(cc->bs);
1167   if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1168     n = ONIGENC_CODE_RANGE_NUM(ranges);
1169     for (i = 0; i < n; i++) {
1170       for (j  = ONIGENC_CODE_RANGE_FROM(ranges, i);
1171            j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1172 	if (j >= sb_out) goto sb_end;
1173 
1174         BITSET_SET_BIT(cc->bs, j);
1175       }
1176     }
1177   }
1178 
1179  sb_end:
1180   if (IS_NULL(ranges)) {
1181   is_null:
1182     cc->mbuf = NULL;
1183   }
1184   else {
1185     BBuf* bbuf;
1186 
1187     n = ONIGENC_CODE_RANGE_NUM(ranges);
1188     if (n == 0) goto is_null;
1189 
1190     bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1191     CHECK_NULL_RETURN(bbuf);
1192     bbuf->alloc = n + 1;
1193     bbuf->used  = n + 1;
1194     bbuf->p     = (UChar* )((void* )ranges);
1195 
1196     cc->mbuf = bbuf;
1197   }
1198 
1199   return node;
1200 }
1201 
1202 static Node*
node_new_ctype(int type,int not)1203 node_new_ctype(int type, int not)
1204 {
1205   Node* node = node_new();
1206   CHECK_NULL_RETURN(node);
1207 
1208   SET_NTYPE(node, NT_CTYPE);
1209   NCTYPE(node)->ctype = type;
1210   NCTYPE(node)->not   = not;
1211   return node;
1212 }
1213 
1214 static Node*
node_new_anychar(void)1215 node_new_anychar(void)
1216 {
1217   Node* node = node_new();
1218   CHECK_NULL_RETURN(node);
1219 
1220   SET_NTYPE(node, NT_CANY);
1221   return node;
1222 }
1223 
1224 static Node*
node_new_list(Node * left,Node * right)1225 node_new_list(Node* left, Node* right)
1226 {
1227   Node* node = node_new();
1228   CHECK_NULL_RETURN(node);
1229 
1230   SET_NTYPE(node, NT_LIST);
1231   NCAR(node)  = left;
1232   NCDR(node) = right;
1233   return node;
1234 }
1235 
1236 extern Node*
onig_node_new_list(Node * left,Node * right)1237 onig_node_new_list(Node* left, Node* right)
1238 {
1239   return node_new_list(left, right);
1240 }
1241 
1242 extern Node*
onig_node_list_add(Node * list,Node * x)1243 onig_node_list_add(Node* list, Node* x)
1244 {
1245   Node *n;
1246 
1247   n = onig_node_new_list(x, NULL);
1248   if (IS_NULL(n)) return NULL_NODE;
1249 
1250   if (IS_NOT_NULL(list)) {
1251     while (IS_NOT_NULL(NCDR(list)))
1252       list = NCDR(list);
1253 
1254     NCDR(list) = n;
1255   }
1256 
1257   return n;
1258 }
1259 
1260 extern Node*
onig_node_new_alt(Node * left,Node * right)1261 onig_node_new_alt(Node* left, Node* right)
1262 {
1263   Node* node = node_new();
1264   CHECK_NULL_RETURN(node);
1265 
1266   SET_NTYPE(node, NT_ALT);
1267   NCAR(node)  = left;
1268   NCDR(node) = right;
1269   return node;
1270 }
1271 
1272 extern Node*
onig_node_new_anchor(int type)1273 onig_node_new_anchor(int type)
1274 {
1275   Node* node = node_new();
1276   CHECK_NULL_RETURN(node);
1277 
1278   SET_NTYPE(node, NT_ANCHOR);
1279   NANCHOR(node)->type     = type;
1280   NANCHOR(node)->target   = NULL;
1281   NANCHOR(node)->char_len = -1;
1282   return node;
1283 }
1284 
1285 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1286 node_new_backref(int back_num, int* backrefs, int by_name,
1287 #ifdef USE_BACKREF_WITH_LEVEL
1288 		 int exist_level, int nest_level,
1289 #endif
1290 		 ScanEnv* env)
1291 {
1292   int i;
1293   Node* node = node_new();
1294 
1295   CHECK_NULL_RETURN(node);
1296 
1297   SET_NTYPE(node, NT_BREF);
1298   NBREF(node)->state    = 0;
1299   NBREF(node)->back_num = back_num;
1300   NBREF(node)->back_dynamic = (int* )NULL;
1301   if (by_name != 0)
1302     NBREF(node)->state |= NST_NAME_REF;
1303 
1304 #ifdef USE_BACKREF_WITH_LEVEL
1305   if (exist_level != 0) {
1306     NBREF(node)->state |= NST_NEST_LEVEL;
1307     NBREF(node)->nest_level  = nest_level;
1308   }
1309 #endif
1310 
1311   for (i = 0; i < back_num; i++) {
1312     if (backrefs[i] <= env->num_mem &&
1313 	IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1314       NBREF(node)->state |= NST_RECURSION;   /* /...(\1).../ */
1315       break;
1316     }
1317   }
1318 
1319   if (back_num <= NODE_BACKREFS_SIZE) {
1320     for (i = 0; i < back_num; i++)
1321       NBREF(node)->back_static[i] = backrefs[i];
1322   }
1323   else {
1324     int* p = (int* )xmalloc(sizeof(int) * back_num);
1325     if (IS_NULL(p)) {
1326       onig_node_free(node);
1327       return NULL;
1328     }
1329     NBREF(node)->back_dynamic = p;
1330     for (i = 0; i < back_num; i++)
1331       p[i] = backrefs[i];
1332   }
1333   return node;
1334 }
1335 
1336 #ifdef USE_SUBEXP_CALL
1337 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum)1338 node_new_call(UChar* name, UChar* name_end, int gnum)
1339 {
1340   Node* node = node_new();
1341   CHECK_NULL_RETURN(node);
1342 
1343   SET_NTYPE(node, NT_CALL);
1344   NCALL(node)->state     = 0;
1345   NCALL(node)->target    = NULL_NODE;
1346   NCALL(node)->name      = name;
1347   NCALL(node)->name_end  = name_end;
1348   NCALL(node)->group_num = gnum;  /* call by number if gnum != 0 */
1349   return node;
1350 }
1351 #endif
1352 
1353 static Node*
node_new_quantifier(int lower,int upper,int by_number)1354 node_new_quantifier(int lower, int upper, int by_number)
1355 {
1356   Node* node = node_new();
1357   CHECK_NULL_RETURN(node);
1358 
1359   SET_NTYPE(node, NT_QTFR);
1360   NQTFR(node)->state  = 0;
1361   NQTFR(node)->target = NULL;
1362   NQTFR(node)->lower  = lower;
1363   NQTFR(node)->upper  = upper;
1364   NQTFR(node)->greedy = 1;
1365   NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1366   NQTFR(node)->head_exact        = NULL_NODE;
1367   NQTFR(node)->next_head_exact   = NULL_NODE;
1368   NQTFR(node)->is_refered        = 0;
1369   if (by_number != 0)
1370     NQTFR(node)->state |= NST_BY_NUMBER;
1371 
1372 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1373   NQTFR(node)->comb_exp_check_num = 0;
1374 #endif
1375 
1376   return node;
1377 }
1378 
1379 static Node*
node_new_enclose(int type)1380 node_new_enclose(int type)
1381 {
1382   Node* node = node_new();
1383   CHECK_NULL_RETURN(node);
1384 
1385   SET_NTYPE(node, NT_ENCLOSE);
1386   NENCLOSE(node)->type      = type;
1387   NENCLOSE(node)->state     =  0;
1388   NENCLOSE(node)->regnum    =  0;
1389   NENCLOSE(node)->option    =  0;
1390   NENCLOSE(node)->target    = NULL;
1391   NENCLOSE(node)->call_addr = -1;
1392   NENCLOSE(node)->opt_count =  0;
1393   return node;
1394 }
1395 
1396 extern Node*
onig_node_new_enclose(int type)1397 onig_node_new_enclose(int type)
1398 {
1399   return node_new_enclose(type);
1400 }
1401 
1402 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1403 node_new_enclose_memory(OnigOptionType option, int is_named)
1404 {
1405   Node* node = node_new_enclose(ENCLOSE_MEMORY);
1406   CHECK_NULL_RETURN(node);
1407   if (is_named != 0)
1408     SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1409 
1410 #ifdef USE_SUBEXP_CALL
1411   NENCLOSE(node)->option = option;
1412 #endif
1413   return node;
1414 }
1415 
1416 static Node*
node_new_option(OnigOptionType option)1417 node_new_option(OnigOptionType option)
1418 {
1419   Node* node = node_new_enclose(ENCLOSE_OPTION);
1420   CHECK_NULL_RETURN(node);
1421   NENCLOSE(node)->option = option;
1422   return node;
1423 }
1424 
1425 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1426 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1427 {
1428   int addlen = (int)(end - s);
1429 
1430   if (addlen > 0) {
1431     int len  = (int)(NSTR(node)->end - NSTR(node)->s);
1432 
1433     if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1434       UChar* p;
1435       int capa = len + addlen + NODE_STR_MARGIN;
1436 
1437       if (capa <= NSTR(node)->capa) {
1438 	onig_strcpy(NSTR(node)->s + len, s, end);
1439       }
1440       else {
1441 	if (NSTR(node)->s == NSTR(node)->buf)
1442 	  p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1443 				      s, end, capa);
1444 	else
1445 	  p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa, NSTR(node)->capa);
1446 
1447 	CHECK_NULL_RETURN_MEMERR(p);
1448 	NSTR(node)->s    = p;
1449 	NSTR(node)->capa = capa;
1450       }
1451     }
1452     else {
1453       onig_strcpy(NSTR(node)->s + len, s, end);
1454     }
1455     NSTR(node)->end = NSTR(node)->s + len + addlen;
1456   }
1457 
1458   return 0;
1459 }
1460 
1461 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1462 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1463 {
1464   onig_node_str_clear(node);
1465   return onig_node_str_cat(node, s, end);
1466 }
1467 
1468 static int
node_str_cat_char(Node * node,UChar c)1469 node_str_cat_char(Node* node, UChar c)
1470 {
1471   UChar s[1];
1472 
1473   s[0] = c;
1474   return onig_node_str_cat(node, s, s + 1);
1475 }
1476 
1477 extern void
onig_node_conv_to_str_node(Node * node,int flag)1478 onig_node_conv_to_str_node(Node* node, int flag)
1479 {
1480   SET_NTYPE(node, NT_STR);
1481   NSTR(node)->flag = flag;
1482   NSTR(node)->capa = 0;
1483   NSTR(node)->s    = NSTR(node)->buf;
1484   NSTR(node)->end  = NSTR(node)->buf;
1485 }
1486 
1487 extern void
onig_node_str_clear(Node * node)1488 onig_node_str_clear(Node* node)
1489 {
1490   if (NSTR(node)->capa != 0 &&
1491       IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1492     xfree(NSTR(node)->s);
1493   }
1494 
1495   NSTR(node)->capa = 0;
1496   NSTR(node)->flag = 0;
1497   NSTR(node)->s    = NSTR(node)->buf;
1498   NSTR(node)->end  = NSTR(node)->buf;
1499 }
1500 
1501 static Node*
node_new_str(const UChar * s,const UChar * end)1502 node_new_str(const UChar* s, const UChar* end)
1503 {
1504   Node* node = node_new();
1505   CHECK_NULL_RETURN(node);
1506 
1507   SET_NTYPE(node, NT_STR);
1508   NSTR(node)->capa = 0;
1509   NSTR(node)->flag = 0;
1510   NSTR(node)->s    = NSTR(node)->buf;
1511   NSTR(node)->end  = NSTR(node)->buf;
1512   if (onig_node_str_cat(node, s, end)) {
1513     onig_node_free(node);
1514     return NULL;
1515   }
1516   return node;
1517 }
1518 
1519 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1520 onig_node_new_str(const UChar* s, const UChar* end)
1521 {
1522   return node_new_str(s, end);
1523 }
1524 
1525 static Node*
node_new_str_raw(UChar * s,UChar * end)1526 node_new_str_raw(UChar* s, UChar* end)
1527 {
1528   Node* node = node_new_str(s, end);
1529   CHECK_NULL_RETURN(node);
1530   NSTRING_SET_RAW(node);
1531   return node;
1532 }
1533 
1534 static Node*
node_new_empty(void)1535 node_new_empty(void)
1536 {
1537   return node_new_str(NULL, NULL);
1538 }
1539 
1540 static Node*
node_new_str_raw_char(UChar c)1541 node_new_str_raw_char(UChar c)
1542 {
1543   UChar p[1];
1544 
1545   p[0] = c;
1546   return node_new_str_raw(p, p + 1);
1547 }
1548 
1549 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1550 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1551 {
1552   const UChar *p;
1553   Node* n = NULL_NODE;
1554 
1555   if (sn->end > sn->s) {
1556     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1557     if (p && p > sn->s) { /* can be splitted. */
1558       n = node_new_str(p, sn->end);
1559       CHECK_NULL_RETURN(n);
1560       if ((sn->flag & NSTR_RAW) != 0)
1561 	NSTRING_SET_RAW(n);
1562       sn->end = (UChar* )p;
1563     }
1564   }
1565   return n;
1566 }
1567 
1568 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1569 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1570 {
1571   if (sn->end > sn->s) {
1572     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1573   }
1574   return 0;
1575 }
1576 
1577 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1578 static int
node_str_head_pad(StrNode * sn,int num,UChar val)1579 node_str_head_pad(StrNode* sn, int num, UChar val)
1580 {
1581   UChar buf[NODE_STR_BUF_SIZE];
1582   int i, len;
1583 
1584   len = sn->end - sn->s;
1585   onig_strcpy(buf, sn->s, sn->end);
1586   onig_strcpy(&(sn->s[num]), buf, buf + len);
1587   sn->end += num;
1588 
1589   for (i = 0; i < num; i++) {
1590     sn->s[i] = val;
1591   }
1592 }
1593 #endif
1594 
1595 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1596 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1597 {
1598   unsigned int num, val;
1599   OnigCodePoint c;
1600   UChar* p = *src;
1601   PFETCH_READY;
1602 
1603   num = 0;
1604   while (!PEND) {
1605     PFETCH(c);
1606     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1607       val = (unsigned int )DIGITVAL(c);
1608       if ((INT_MAX_LIMIT - val) / 10UL < num)
1609 	return -1;  /* overflow */
1610 
1611       num = num * 10 + val;
1612     }
1613     else {
1614       PUNFETCH;
1615       break;
1616     }
1617   }
1618   *src = p;
1619   return num;
1620 }
1621 
1622 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1623 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1624 				 OnigEncoding enc)
1625 {
1626   OnigCodePoint c;
1627   unsigned int num, val;
1628   UChar* p = *src;
1629   PFETCH_READY;
1630 
1631   num = 0;
1632   while (!PEND && maxlen-- != 0) {
1633     PFETCH(c);
1634     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1635       val = (unsigned int )XDIGITVAL(enc,c);
1636       if ((INT_MAX_LIMIT - val) / 16UL < num)
1637 	return -1;  /* overflow */
1638 
1639       num = (num << 4) + XDIGITVAL(enc,c);
1640     }
1641     else {
1642       PUNFETCH;
1643       break;
1644     }
1645   }
1646   *src = p;
1647   return num;
1648 }
1649 
1650 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1651 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1652 			   OnigEncoding enc)
1653 {
1654   OnigCodePoint c;
1655   unsigned int num, val;
1656   UChar* p = *src;
1657   PFETCH_READY;
1658 
1659   num = 0;
1660   while (!PEND && maxlen-- != 0) {
1661     PFETCH(c);
1662     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1663       val = ODIGITVAL(c);
1664       if ((INT_MAX_LIMIT - val) / 8UL < num)
1665 	return -1;  /* overflow */
1666 
1667       num = (num << 3) + val;
1668     }
1669     else {
1670       PUNFETCH;
1671       break;
1672     }
1673   }
1674   *src = p;
1675   return num;
1676 }
1677 
1678 
1679 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1680     BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1681 
1682 /* data format:
1683      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1684      (all data size is OnigCodePoint)
1685  */
1686 static int
new_code_range(BBuf ** pbuf)1687 new_code_range(BBuf** pbuf)
1688 {
1689 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1690   int r;
1691   OnigCodePoint n;
1692   BBuf* bbuf;
1693 
1694   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1695   CHECK_NULL_RETURN_MEMERR(*pbuf);
1696   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1697   if (r) return r;
1698 
1699   n = 0;
1700   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1701   return 0;
1702 }
1703 
1704 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1705 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1706 {
1707   int r, inc_n, pos;
1708   int low, high, bound, x;
1709   OnigCodePoint n, *data;
1710   BBuf* bbuf;
1711 
1712   if (from > to) {
1713     n = from; from = to; to = n;
1714   }
1715 
1716   if (IS_NULL(*pbuf)) {
1717     r = new_code_range(pbuf);
1718     if (r) return r;
1719     bbuf = *pbuf;
1720     n = 0;
1721   }
1722   else {
1723     bbuf = *pbuf;
1724     GET_CODE_POINT(n, bbuf->p);
1725   }
1726   data = (OnigCodePoint* )(bbuf->p);
1727   data++;
1728 
1729   for (low = 0, bound = n; low < bound; ) {
1730     x = (low + bound) >> 1;
1731     if (from > data[x*2 + 1])
1732       low = x + 1;
1733     else
1734       bound = x;
1735   }
1736 
1737   for (high = low, bound = n; high < bound; ) {
1738     x = (high + bound) >> 1;
1739     if (to >= data[x*2] - 1)
1740       high = x + 1;
1741     else
1742       bound = x;
1743   }
1744 
1745   inc_n = low + 1 - high;
1746   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1747     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1748 
1749   if (inc_n != 1) {
1750     if (from > data[low*2])
1751       from = data[low*2];
1752     if (to < data[(high - 1)*2 + 1])
1753       to = data[(high - 1)*2 + 1];
1754   }
1755 
1756   if (inc_n != 0 && (OnigCodePoint )high < n) {
1757     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1758     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1759     int size = (n - high) * 2 * SIZE_CODE_POINT;
1760 
1761     if (inc_n > 0) {
1762       BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1763     }
1764     else {
1765       BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1766     }
1767   }
1768 
1769   pos = SIZE_CODE_POINT * (1 + low * 2);
1770   BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1771   BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1772   BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1773   n += inc_n;
1774   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1775 
1776   return 0;
1777 }
1778 
1779 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1780 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1781 {
1782   if (from > to) {
1783     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1784       return 0;
1785     else
1786       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1787   }
1788 
1789   return add_code_range_to_buf(pbuf, from, to);
1790 }
1791 
1792 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1793 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1794 {
1795   int r, i, n;
1796   OnigCodePoint pre, from, *data, to = 0;
1797 
1798   *pbuf = (BBuf* )NULL;
1799   if (IS_NULL(bbuf)) {
1800   set_all:
1801     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1802   }
1803 
1804   data = (OnigCodePoint* )(bbuf->p);
1805   GET_CODE_POINT(n, data);
1806   data++;
1807   if (n <= 0) goto set_all;
1808 
1809   r = 0;
1810   pre = MBCODE_START_POS(enc);
1811   for (i = 0; i < n; i++) {
1812     from = data[i*2];
1813     to   = data[i*2+1];
1814     if (pre <= from - 1) {
1815       r = add_code_range_to_buf(pbuf, pre, from - 1);
1816       if (r != 0) return r;
1817     }
1818     if (to == ~((OnigCodePoint )0)) break;
1819     pre = to + 1;
1820   }
1821   if (to < ~((OnigCodePoint )0)) {
1822     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1823   }
1824   return r;
1825 }
1826 
1827 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
1828   BBuf *tbuf; \
1829   int  tnot; \
1830   tnot = not1;  not1  = not2;  not2  = tnot; \
1831   tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
1832 } while (0)
1833 
1834 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1835 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1836                   BBuf* bbuf2, int not2, BBuf** pbuf)
1837 {
1838   int r;
1839   OnigCodePoint i, n1, *data1;
1840   OnigCodePoint from, to;
1841 
1842   *pbuf = (BBuf* )NULL;
1843   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1844     if (not1 != 0 || not2 != 0)
1845       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1846     return 0;
1847   }
1848 
1849   r = 0;
1850   if (IS_NULL(bbuf2))
1851     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1852 
1853   if (IS_NULL(bbuf1)) {
1854     if (not1 != 0) {
1855       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1856     }
1857     else {
1858       if (not2 == 0) {
1859 	return bbuf_clone(pbuf, bbuf2);
1860       }
1861       else {
1862 	return not_code_range_buf(enc, bbuf2, pbuf);
1863       }
1864     }
1865   }
1866 
1867   if (not1 != 0)
1868     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1869 
1870   data1 = (OnigCodePoint* )(bbuf1->p);
1871   GET_CODE_POINT(n1, data1);
1872   data1++;
1873 
1874   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1875     r = bbuf_clone(pbuf, bbuf2);
1876   }
1877   else if (not1 == 0) { /* 1 OR (not 2) */
1878     r = not_code_range_buf(enc, bbuf2, pbuf);
1879   }
1880   if (r != 0) return r;
1881 
1882   for (i = 0; i < n1; i++) {
1883     from = data1[i*2];
1884     to   = data1[i*2+1];
1885     r = add_code_range_to_buf(pbuf, from, to);
1886     if (r != 0) return r;
1887   }
1888   return 0;
1889 }
1890 
1891 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1892 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1893 	        OnigCodePoint* data, int n)
1894 {
1895   int i, r;
1896   OnigCodePoint from2, to2;
1897 
1898   for (i = 0; i < n; i++) {
1899     from2 = data[i*2];
1900     to2   = data[i*2+1];
1901     if (from2 < from1) {
1902       if (to2 < from1) continue;
1903       else {
1904 	from1 = to2 + 1;
1905       }
1906     }
1907     else if (from2 <= to1) {
1908       if (to2 < to1) {
1909 	if (from1 <= from2 - 1) {
1910 	  r = add_code_range_to_buf(pbuf, from1, from2-1);
1911 	  if (r != 0) return r;
1912 	}
1913 	from1 = to2 + 1;
1914       }
1915       else {
1916 	to1 = from2 - 1;
1917       }
1918     }
1919     else {
1920       from1 = from2;
1921     }
1922     if (from1 > to1) break;
1923   }
1924   if (from1 <= to1) {
1925     r = add_code_range_to_buf(pbuf, from1, to1);
1926     if (r != 0) return r;
1927   }
1928   return 0;
1929 }
1930 
1931 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1932 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1933 {
1934   int r;
1935   OnigCodePoint i, j, n1, n2, *data1, *data2;
1936   OnigCodePoint from, to, from1, to1, from2, to2;
1937 
1938   *pbuf = (BBuf* )NULL;
1939   if (IS_NULL(bbuf1)) {
1940     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1941       return bbuf_clone(pbuf, bbuf2);
1942     return 0;
1943   }
1944   else if (IS_NULL(bbuf2)) {
1945     if (not2 != 0)
1946       return bbuf_clone(pbuf, bbuf1);
1947     return 0;
1948   }
1949 
1950   if (not1 != 0)
1951     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1952 
1953   data1 = (OnigCodePoint* )(bbuf1->p);
1954   data2 = (OnigCodePoint* )(bbuf2->p);
1955   GET_CODE_POINT(n1, data1);
1956   GET_CODE_POINT(n2, data2);
1957   data1++;
1958   data2++;
1959 
1960   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1961     for (i = 0; i < n1; i++) {
1962       from1 = data1[i*2];
1963       to1   = data1[i*2+1];
1964       for (j = 0; j < n2; j++) {
1965 	from2 = data2[j*2];
1966 	to2   = data2[j*2+1];
1967 	if (from2 > to1) break;
1968 	if (to2 < from1) continue;
1969 	from = MAX(from1, from2);
1970 	to   = MIN(to1, to2);
1971 	r = add_code_range_to_buf(pbuf, from, to);
1972 	if (r != 0) return r;
1973       }
1974     }
1975   }
1976   else if (not1 == 0) { /* 1 AND (not 2) */
1977     for (i = 0; i < n1; i++) {
1978       from1 = data1[i*2];
1979       to1   = data1[i*2+1];
1980       r = and_code_range1(pbuf, from1, to1, data2, n2);
1981       if (r != 0) return r;
1982     }
1983   }
1984 
1985   return 0;
1986 }
1987 
1988 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1989 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1990 {
1991   int r, not1, not2;
1992   BBuf *buf1, *buf2, *pbuf;
1993   BitSetRef bsr1, bsr2;
1994   BitSet bs1, bs2;
1995 
1996   not1 = IS_NCCLASS_NOT(dest);
1997   bsr1 = dest->bs;
1998   buf1 = dest->mbuf;
1999   not2 = IS_NCCLASS_NOT(cc);
2000   bsr2 = cc->bs;
2001   buf2 = cc->mbuf;
2002 
2003   if (not1 != 0) {
2004     bitset_invert_to(bsr1, bs1);
2005     bsr1 = bs1;
2006   }
2007   if (not2 != 0) {
2008     bitset_invert_to(bsr2, bs2);
2009     bsr2 = bs2;
2010   }
2011   bitset_and(bsr1, bsr2);
2012   if (bsr1 != dest->bs) {
2013     bitset_copy(dest->bs, bsr1);
2014     bsr1 = dest->bs;
2015   }
2016   if (not1 != 0) {
2017     bitset_invert(dest->bs);
2018   }
2019 
2020   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2021     if (not1 != 0 && not2 != 0) {
2022       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2023     }
2024     else {
2025       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2026       if (r == 0 && not1 != 0) {
2027 	BBuf *tbuf;
2028 	r = not_code_range_buf(enc, pbuf, &tbuf);
2029 	if (r != 0) {
2030 	  bbuf_free(pbuf);
2031 	  return r;
2032 	}
2033 	bbuf_free(pbuf);
2034 	pbuf = tbuf;
2035       }
2036     }
2037     if (r != 0) return r;
2038 
2039     dest->mbuf = pbuf;
2040     bbuf_free(buf1);
2041     return r;
2042   }
2043   return 0;
2044 }
2045 
2046 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2047 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2048 {
2049   int r, not1, not2;
2050   BBuf *buf1, *buf2, *pbuf;
2051   BitSetRef bsr1, bsr2;
2052   BitSet bs1, bs2;
2053 
2054   not1 = IS_NCCLASS_NOT(dest);
2055   bsr1 = dest->bs;
2056   buf1 = dest->mbuf;
2057   not2 = IS_NCCLASS_NOT(cc);
2058   bsr2 = cc->bs;
2059   buf2 = cc->mbuf;
2060 
2061   if (not1 != 0) {
2062     bitset_invert_to(bsr1, bs1);
2063     bsr1 = bs1;
2064   }
2065   if (not2 != 0) {
2066     bitset_invert_to(bsr2, bs2);
2067     bsr2 = bs2;
2068   }
2069   bitset_or(bsr1, bsr2);
2070   if (bsr1 != dest->bs) {
2071     bitset_copy(dest->bs, bsr1);
2072     bsr1 = dest->bs;
2073   }
2074   if (not1 != 0) {
2075     bitset_invert(dest->bs);
2076   }
2077 
2078   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2079     if (not1 != 0 && not2 != 0) {
2080       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2081     }
2082     else {
2083       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2084       if (r == 0 && not1 != 0) {
2085 	BBuf *tbuf;
2086 	r = not_code_range_buf(enc, pbuf, &tbuf);
2087 	if (r != 0) {
2088 	  bbuf_free(pbuf);
2089 	  return r;
2090 	}
2091 	bbuf_free(pbuf);
2092 	pbuf = tbuf;
2093       }
2094     }
2095     if (r != 0) return r;
2096 
2097     dest->mbuf = pbuf;
2098     bbuf_free(buf1);
2099     return r;
2100   }
2101   else
2102     return 0;
2103 }
2104 
2105 static int
conv_backslash_value(int c,ScanEnv * env)2106 conv_backslash_value(int c, ScanEnv* env)
2107 {
2108   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2109     switch (c) {
2110     case 'n': return '\n';
2111     case 't': return '\t';
2112     case 'r': return '\r';
2113     case 'f': return '\f';
2114     case 'a': return '\007';
2115     case 'b': return '\010';
2116     case 'e': return '\033';
2117     case 'v':
2118       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2119 	return '\v';
2120       break;
2121 
2122     default:
2123       break;
2124     }
2125   }
2126   return c;
2127 }
2128 
2129 static int
is_invalid_quantifier_target(Node * node)2130 is_invalid_quantifier_target(Node* node)
2131 {
2132   switch (NTYPE(node)) {
2133   case NT_ANCHOR:
2134     return 1;
2135     break;
2136 
2137   case NT_ENCLOSE:
2138     /* allow enclosed elements */
2139     /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2140     break;
2141 
2142   case NT_LIST:
2143     do {
2144       if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2145     } while (IS_NOT_NULL(node = NCDR(node)));
2146     return 0;
2147     break;
2148 
2149   case NT_ALT:
2150     do {
2151       if (is_invalid_quantifier_target(NCAR(node))) return 1;
2152     } while (IS_NOT_NULL(node = NCDR(node)));
2153     break;
2154 
2155   default:
2156     break;
2157   }
2158   return 0;
2159 }
2160 
2161 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2162 static int
popular_quantifier_num(QtfrNode * q)2163 popular_quantifier_num(QtfrNode* q)
2164 {
2165   if (q->greedy) {
2166     if (q->lower == 0) {
2167       if (q->upper == 1) return 0;
2168       else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2169     }
2170     else if (q->lower == 1) {
2171       if (IS_REPEAT_INFINITE(q->upper)) return 2;
2172     }
2173   }
2174   else {
2175     if (q->lower == 0) {
2176       if (q->upper == 1) return 3;
2177       else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2178     }
2179     else if (q->lower == 1) {
2180       if (IS_REPEAT_INFINITE(q->upper)) return 5;
2181     }
2182   }
2183   return -1;
2184 }
2185 
2186 
2187 enum ReduceType {
2188   RQ_ASIS = 0, /* as is */
2189   RQ_DEL  = 1, /* delete parent */
2190   RQ_A,        /* to '*'    */
2191   RQ_AQ,       /* to '*?'   */
2192   RQ_QQ,       /* to '??'   */
2193   RQ_P_QQ,     /* to '+)??' */
2194   RQ_PQ_Q      /* to '+?)?' */
2195 };
2196 
2197 static enum ReduceType ReduceTypeTable[6][6] = {
2198   {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
2199   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
2200   {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
2201   {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
2202   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
2203   {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
2204 };
2205 
2206 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2207 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2208 {
2209   int pnum, cnum;
2210   QtfrNode *p, *c;
2211 
2212   p = NQTFR(pnode);
2213   c = NQTFR(cnode);
2214   pnum = popular_quantifier_num(p);
2215   cnum = popular_quantifier_num(c);
2216   if (pnum < 0 || cnum < 0) return ;
2217 
2218   switch(ReduceTypeTable[cnum][pnum]) {
2219   case RQ_DEL:
2220     CopyMem (pnode, cnode, sizeof (Node));
2221     break;
2222   case RQ_A:
2223     p->target = c->target;
2224     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2225     break;
2226   case RQ_AQ:
2227     p->target = c->target;
2228     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2229     break;
2230   case RQ_QQ:
2231     p->target = c->target;
2232     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2233     break;
2234   case RQ_P_QQ:
2235     p->target = cnode;
2236     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2237     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2238     return ;
2239     break;
2240   case RQ_PQ_Q:
2241     p->target = cnode;
2242     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2243     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2244     return ;
2245     break;
2246   case RQ_ASIS:
2247     p->target = cnode;
2248     return ;
2249     break;
2250   }
2251 
2252   c->target = NULL_NODE;
2253   onig_node_free(cnode);
2254 }
2255 
2256 
2257 enum TokenSyms {
2258   TK_EOT      = 0,   /* end of token */
2259   TK_RAW_BYTE = 1,
2260   TK_CHAR,
2261   TK_STRING,
2262   TK_CODE_POINT,
2263   TK_ANYCHAR,
2264   TK_CHAR_TYPE,
2265   TK_BACKREF,
2266   TK_CALL,
2267   TK_ANCHOR,
2268   TK_OP_REPEAT,
2269   TK_INTERVAL,
2270   TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
2271   TK_ALT,
2272   TK_SUBEXP_OPEN,
2273   TK_SUBEXP_CLOSE,
2274   TK_CC_OPEN,
2275   TK_QUOTE_OPEN,
2276   TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
2277   /* in cc */
2278   TK_CC_CLOSE,
2279   TK_CC_RANGE,
2280   TK_POSIX_BRACKET_OPEN,
2281   TK_CC_AND,             /* && */
2282   TK_CC_CC_OPEN          /* [ */
2283 };
2284 
2285 typedef struct {
2286   enum TokenSyms type;
2287   int escaped;
2288   int base;   /* is number: 8, 16 (used in [....]) */
2289   UChar* backp;
2290   union {
2291     UChar* s;
2292     int   c;
2293     OnigCodePoint code;
2294     int   anchor;
2295     int   subtype;
2296     struct {
2297       int lower;
2298       int upper;
2299       int greedy;
2300       int possessive;
2301     } repeat;
2302     struct {
2303       int  num;
2304       int  ref1;
2305       int* refs;
2306       int  by_name;
2307 #ifdef USE_BACKREF_WITH_LEVEL
2308       int  exist_level;
2309       int  level;   /* \k<name+n> */
2310 #endif
2311     } backref;
2312     struct {
2313       UChar* name;
2314       UChar* name_end;
2315       int    gnum;
2316     } call;
2317     struct {
2318       int ctype;
2319       int not;
2320     } prop;
2321   } u;
2322 } OnigToken;
2323 
2324 
2325 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2326 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2327 {
2328   int low, up, syn_allow, non_low = 0;
2329   int r = 0;
2330   OnigCodePoint c;
2331   OnigEncoding enc = env->enc;
2332   UChar* p = *src;
2333   PFETCH_READY;
2334 
2335   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2336 
2337   if (PEND) {
2338     if (syn_allow)
2339       return 1;  /* "....{" : OK! */
2340     else
2341       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2342   }
2343 
2344   if (! syn_allow) {
2345     c = PPEEK;
2346     if (c == ')' || c == '(' || c == '|') {
2347       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2348     }
2349   }
2350 
2351   low = onig_scan_unsigned_number(&p, end, env->enc);
2352   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2353   if (low > ONIG_MAX_REPEAT_NUM)
2354     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2355 
2356   if (p == *src) { /* can't read low */
2357     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2358       /* allow {,n} as {0,n} */
2359       low = 0;
2360       non_low = 1;
2361     }
2362     else
2363       goto invalid;
2364   }
2365 
2366   if (PEND) goto invalid;
2367   PFETCH(c);
2368   if (c == ',') {
2369     UChar* prev = p;
2370     up = onig_scan_unsigned_number(&p, end, env->enc);
2371     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2372     if (up > ONIG_MAX_REPEAT_NUM)
2373       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2374 
2375     if (p == prev) {
2376       if (non_low != 0)
2377 	goto invalid;
2378       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2379     }
2380   }
2381   else {
2382     if (non_low != 0)
2383       goto invalid;
2384 
2385     PUNFETCH;
2386     up = low;  /* {n} : exact n times */
2387     r = 2;     /* fixed */
2388   }
2389 
2390   if (PEND) goto invalid;
2391   PFETCH(c);
2392   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2393     if (c != MC_ESC(env->syntax)) goto invalid;
2394     PFETCH(c);
2395   }
2396   if (c != '}') goto invalid;
2397 
2398   if (!IS_REPEAT_INFINITE(up) && low > up) {
2399     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2400   }
2401 
2402   tok->type = TK_INTERVAL;
2403   tok->u.repeat.lower = low;
2404   tok->u.repeat.upper = up;
2405   *src = p;
2406   return r; /* 0: normal {n,m}, 2: fixed {n} */
2407 
2408  invalid:
2409   if (syn_allow)
2410     return 1;  /* OK */
2411   else
2412     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2413 }
2414 
2415 /* \M-, \C-, \c, or \... */
2416 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2417 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2418 {
2419   int v;
2420   OnigCodePoint c;
2421   OnigEncoding enc = env->enc;
2422   UChar* p = *src;
2423 
2424   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2425 
2426   PFETCH_S(c);
2427   switch (c) {
2428   case 'M':
2429     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2430       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2431       PFETCH_S(c);
2432       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2433       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2434       PFETCH_S(c);
2435       if (c == MC_ESC(env->syntax)) {
2436         v = fetch_escaped_value(&p, end, env);
2437         if (v < 0) return v;
2438         c = (OnigCodePoint )v;
2439       }
2440       c = ((c & 0xff) | 0x80);
2441     }
2442     else
2443       goto backslash;
2444     break;
2445 
2446   case 'C':
2447     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2448       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2449       PFETCH_S(c);
2450       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2451       goto control;
2452     }
2453     else
2454       goto backslash;
2455 
2456   case 'c':
2457     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2458     control:
2459       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2460       PFETCH_S(c);
2461       if (c == '?') {
2462         c = 0177;
2463       }
2464       else {
2465         if (c == MC_ESC(env->syntax)) {
2466           v = fetch_escaped_value(&p, end, env);
2467           if (v < 0) return v;
2468           c = (OnigCodePoint )v;
2469         }
2470         c &= 0x9f;
2471       }
2472       break;
2473     }
2474     /* fall through */
2475 
2476   default:
2477     {
2478     backslash:
2479       c = conv_backslash_value(c, env);
2480     }
2481     break;
2482   }
2483 
2484   *src = p;
2485   return c;
2486 }
2487 
2488 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2489 
2490 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2491 get_name_end_code_point(OnigCodePoint start)
2492 {
2493   switch (start) {
2494   case '<':  return (OnigCodePoint )'>'; break;
2495   case '\'': return (OnigCodePoint )'\''; break;
2496   default:
2497     break;
2498   }
2499 
2500   return (OnigCodePoint )0;
2501 }
2502 
2503 #ifdef USE_NAMED_GROUP
2504 #ifdef USE_BACKREF_WITH_LEVEL
2505 /*
2506    \k<name+n>, \k<name-n>
2507    \k<num+n>,  \k<num-n>
2508    \k<-num+n>, \k<-num-n>
2509 */
2510 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2511 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2512 		      UChar** rname_end, ScanEnv* env,
2513 		      int* rback_num, int* rlevel)
2514 {
2515   int r, sign, is_num, exist_level;
2516   OnigCodePoint end_code;
2517   OnigCodePoint c = 0;
2518   OnigEncoding enc = env->enc;
2519   UChar *name_end;
2520   UChar *pnum_head;
2521   UChar *p = *src;
2522   PFETCH_READY;
2523 
2524   *rback_num = 0;
2525   is_num = exist_level = 0;
2526   sign = 1;
2527   pnum_head = *src;
2528 
2529   end_code = get_name_end_code_point(start_code);
2530 
2531   name_end = end;
2532   r = 0;
2533   if (PEND) {
2534     return ONIGERR_EMPTY_GROUP_NAME;
2535   }
2536   else {
2537     PFETCH(c);
2538     if (c == end_code)
2539       return ONIGERR_EMPTY_GROUP_NAME;
2540 
2541     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2542       is_num = 1;
2543     }
2544     else if (c == '-') {
2545       is_num = 2;
2546       sign = -1;
2547       pnum_head = p;
2548     }
2549     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2550       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2551     }
2552   }
2553 
2554   while (!PEND) {
2555     name_end = p;
2556     PFETCH(c);
2557     if (c == end_code || c == ')' || c == '+' || c == '-') {
2558       if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2559       break;
2560     }
2561 
2562     if (is_num != 0) {
2563       if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2564         is_num = 1;
2565       }
2566       else {
2567         r = ONIGERR_INVALID_GROUP_NAME;
2568         is_num = 0;
2569       }
2570     }
2571     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2572       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2573     }
2574   }
2575 
2576   if (r == 0 && c != end_code) {
2577     if (c == '+' || c == '-') {
2578       int level;
2579       int flag = (c == '-' ? -1 : 1);
2580 
2581       PFETCH(c);
2582       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2583       PUNFETCH;
2584       level = onig_scan_unsigned_number(&p, end, enc);
2585       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2586       *rlevel = (level * flag);
2587       exist_level = 1;
2588 
2589       PFETCH(c);
2590       if (c == end_code)
2591 	goto end;
2592     }
2593 
2594   err:
2595     r = ONIGERR_INVALID_GROUP_NAME;
2596     name_end = end;
2597   }
2598 
2599  end:
2600   if (r == 0) {
2601     if (is_num != 0) {
2602       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2603       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2604       else if (*rback_num == 0) goto err;
2605 
2606       *rback_num *= sign;
2607     }
2608 
2609     *rname_end = name_end;
2610     *src = p;
2611     return (exist_level ? 1 : 0);
2612   }
2613   else {
2614     onig_scan_env_set_error_string(env, r, *src, name_end);
2615     return r;
2616   }
2617 }
2618 #endif /* USE_BACKREF_WITH_LEVEL */
2619 
2620 /*
2621   def: 0 -> define name    (don't allow number name)
2622        1 -> reference name (allow number name)
2623 */
2624 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2625 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2626 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2627 {
2628   int r, is_num, sign;
2629   OnigCodePoint end_code;
2630   OnigCodePoint c = 0;
2631   OnigEncoding enc = env->enc;
2632   UChar *name_end;
2633   UChar *pnum_head;
2634   UChar *p = *src;
2635 
2636   *rback_num = 0;
2637 
2638   end_code = get_name_end_code_point(start_code);
2639 
2640   name_end = end;
2641   pnum_head = *src;
2642   r = 0;
2643   is_num = 0;
2644   sign = 1;
2645   if (PEND) {
2646     return ONIGERR_EMPTY_GROUP_NAME;
2647   }
2648   else {
2649     PFETCH_S(c);
2650     if (c == end_code)
2651       return ONIGERR_EMPTY_GROUP_NAME;
2652 
2653     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2654       if (ref == 1)
2655         is_num = 1;
2656       else {
2657         r = ONIGERR_INVALID_GROUP_NAME;
2658         is_num = 0;
2659       }
2660     }
2661     else if (c == '-') {
2662       if (ref == 1) {
2663         is_num = 2;
2664         sign = -1;
2665         pnum_head = p;
2666       }
2667       else {
2668         r = ONIGERR_INVALID_GROUP_NAME;
2669         is_num = 0;
2670       }
2671     }
2672     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2673       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2674     }
2675   }
2676 
2677   if (r == 0) {
2678     while (!PEND) {
2679       name_end = p;
2680       PFETCH_S(c);
2681       if (c == end_code || c == ')') {
2682         if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2683         break;
2684       }
2685 
2686       if (is_num != 0) {
2687         if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2688           is_num = 1;
2689         }
2690         else {
2691           if (!ONIGENC_IS_CODE_WORD(enc, c))
2692             r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2693           else
2694             r = ONIGERR_INVALID_GROUP_NAME;
2695           is_num = 0;
2696         }
2697       }
2698       else {
2699         if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2700           r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2701         }
2702       }
2703     }
2704 
2705     if (c != end_code) {
2706       r = ONIGERR_INVALID_GROUP_NAME;
2707       name_end = end;
2708     }
2709 
2710     if (is_num != 0) {
2711       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2712       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2713       else if (*rback_num == 0) {
2714         r = ONIGERR_INVALID_GROUP_NAME;
2715         goto err;
2716       }
2717 
2718       *rback_num *= sign;
2719     }
2720 
2721     *rname_end = name_end;
2722     *src = p;
2723     return 0;
2724   }
2725   else {
2726     while (!PEND) {
2727       name_end = p;
2728       PFETCH_S(c);
2729       if (c == end_code || c == ')')
2730         break;
2731     }
2732     if (PEND)
2733       name_end = end;
2734 
2735   err:
2736     onig_scan_env_set_error_string(env, r, *src, name_end);
2737     return r;
2738   }
2739 }
2740 #else
2741 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2742 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2743 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2744 {
2745   int r, is_num, sign;
2746   OnigCodePoint end_code;
2747   OnigCodePoint c = 0;
2748   UChar *name_end;
2749   OnigEncoding enc = env->enc;
2750   UChar *pnum_head;
2751   UChar *p = *src;
2752   PFETCH_READY;
2753 
2754   *rback_num = 0;
2755 
2756   end_code = get_name_end_code_point(start_code);
2757 
2758   *rname_end = name_end = end;
2759   r = 0;
2760   pnum_head = *src;
2761   is_num = 0;
2762   sign = 1;
2763 
2764   if (PEND) {
2765     return ONIGERR_EMPTY_GROUP_NAME;
2766   }
2767   else {
2768     PFETCH(c);
2769     if (c == end_code)
2770       return ONIGERR_EMPTY_GROUP_NAME;
2771 
2772     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2773       is_num = 1;
2774     }
2775     else if (c == '-') {
2776       is_num = 2;
2777       sign = -1;
2778       pnum_head = p;
2779     }
2780     else {
2781       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2782     }
2783   }
2784 
2785   while (!PEND) {
2786     name_end = p;
2787 
2788     PFETCH(c);
2789     if (c == end_code || c == ')') break;
2790     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2791       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2792   }
2793   if (r == 0 && c != end_code) {
2794     r = ONIGERR_INVALID_GROUP_NAME;
2795     name_end = end;
2796   }
2797 
2798   if (r == 0) {
2799     *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2800     if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2801     else if (*rback_num == 0) {
2802       r = ONIGERR_INVALID_GROUP_NAME;
2803       goto err;
2804     }
2805     *rback_num *= sign;
2806 
2807     *rname_end = name_end;
2808     *src = p;
2809     return 0;
2810   }
2811   else {
2812   err:
2813     onig_scan_env_set_error_string(env, r, *src, name_end);
2814     return r;
2815   }
2816 }
2817 #endif /* USE_NAMED_GROUP */
2818 
2819 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2820 CC_ESC_WARN(ScanEnv* env, UChar *c)
2821 {
2822   if (onig_warn == onig_null_warn) return ;
2823 
2824   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2825       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2826     UChar buf[WARN_BUFSIZE];
2827     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2828 		env->pattern, env->pattern_end,
2829                 (UChar* )"character class has '%s' without escape", c);
2830     (*onig_warn)((char* )buf);
2831   }
2832 }
2833 
2834 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2835 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2836 {
2837   if (onig_warn == onig_null_warn) return ;
2838 
2839   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2840     UChar buf[WARN_BUFSIZE];
2841     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2842 		(env)->pattern, (env)->pattern_end,
2843 		(UChar* )"regular expression has '%s' without escape", c);
2844     (*onig_warn)((char* )buf);
2845   }
2846 }
2847 
2848 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2849 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2850 		  UChar **next, OnigEncoding enc)
2851 {
2852   int i;
2853   OnigCodePoint x;
2854   UChar *q;
2855   UChar *p = from;
2856 
2857   while (p < to) {
2858     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2859     q = p + enclen(enc, p);
2860     if (x == s[0]) {
2861       for (i = 1; i < n && q < to; i++) {
2862 	x = ONIGENC_MBC_TO_CODE(enc, q, to);
2863 	if (x != s[i]) break;
2864 	q += enclen(enc, q);
2865       }
2866       if (i >= n) {
2867 	if (IS_NOT_NULL(next))
2868 	  *next = q;
2869 	return p;
2870       }
2871     }
2872     p = q;
2873   }
2874   return NULL_UCHARP;
2875 }
2876 
2877 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2878 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2879 		 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2880 {
2881   int i, in_esc;
2882   OnigCodePoint x;
2883   UChar *q;
2884   UChar *p = from;
2885 
2886   in_esc = 0;
2887   while (p < to) {
2888     if (in_esc) {
2889       in_esc = 0;
2890       p += enclen(enc, p);
2891     }
2892     else {
2893       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2894       q = p + enclen(enc, p);
2895       if (x == s[0]) {
2896 	for (i = 1; i < n && q < to; i++) {
2897 	  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2898 	  if (x != s[i]) break;
2899 	  q += enclen(enc, q);
2900 	}
2901 	if (i >= n) return 1;
2902 	p += enclen(enc, p);
2903       }
2904       else {
2905 	x = ONIGENC_MBC_TO_CODE(enc, p, to);
2906 	if (x == bad) return 0;
2907 	else if (x == MC_ESC(syn)) in_esc = 1;
2908 	p = q;
2909       }
2910     }
2911   }
2912   return 0;
2913 }
2914 
2915 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2916 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2917 {
2918   int num;
2919   OnigCodePoint c, c2;
2920   OnigSyntaxType* syn = env->syntax;
2921   OnigEncoding enc = env->enc;
2922   UChar* prev;
2923   UChar* p = *src;
2924   PFETCH_READY;
2925 
2926   if (PEND) {
2927     tok->type = TK_EOT;
2928     return tok->type;
2929   }
2930 
2931   PFETCH(c);
2932   tok->type = TK_CHAR;
2933   tok->base = 0;
2934   tok->u.c  = c;
2935   tok->escaped = 0;
2936 
2937   if (c == ']') {
2938     tok->type = TK_CC_CLOSE;
2939   }
2940   else if (c == '-') {
2941     tok->type = TK_CC_RANGE;
2942   }
2943   else if (c == MC_ESC(syn)) {
2944     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2945       goto end;
2946 
2947     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2948 
2949     PFETCH(c);
2950     tok->escaped = 1;
2951     tok->u.c = c;
2952     switch (c) {
2953     case 'w':
2954       tok->type = TK_CHAR_TYPE;
2955       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2956       tok->u.prop.not   = 0;
2957       break;
2958     case 'W':
2959       tok->type = TK_CHAR_TYPE;
2960       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2961       tok->u.prop.not   = 1;
2962       break;
2963     case 'd':
2964       tok->type = TK_CHAR_TYPE;
2965       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2966       tok->u.prop.not   = 0;
2967       break;
2968     case 'D':
2969       tok->type = TK_CHAR_TYPE;
2970       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2971       tok->u.prop.not   = 1;
2972       break;
2973     case 's':
2974       tok->type = TK_CHAR_TYPE;
2975       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2976       tok->u.prop.not   = 0;
2977       break;
2978     case 'S':
2979       tok->type = TK_CHAR_TYPE;
2980       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2981       tok->u.prop.not   = 1;
2982       break;
2983     case 'h':
2984       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2985       tok->type = TK_CHAR_TYPE;
2986       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2987       tok->u.prop.not   = 0;
2988       break;
2989     case 'H':
2990       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2991       tok->type = TK_CHAR_TYPE;
2992       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2993       tok->u.prop.not   = 1;
2994       break;
2995 
2996     case 'p':
2997     case 'P':
2998       c2 = PPEEK;
2999       if (c2 == '{' &&
3000 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3001 	PINC;
3002 	tok->type = TK_CHAR_PROPERTY;
3003 	tok->u.prop.not = (c == 'P' ? 1 : 0);
3004 
3005 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3006 	  PFETCH(c2);
3007 	  if (c2 == '^') {
3008 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3009 	  }
3010 	  else
3011 	    PUNFETCH;
3012 	}
3013       }
3014       break;
3015 
3016     case 'x':
3017       if (PEND) break;
3018 
3019       prev = p;
3020       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3021 	PINC;
3022 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3023 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3024 	if (!PEND) {
3025           c2 = PPEEK;
3026           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3027             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3028         }
3029 
3030 	if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3031 	  PINC;
3032 	  tok->type   = TK_CODE_POINT;
3033 	  tok->base   = 16;
3034 	  tok->u.code = (OnigCodePoint )num;
3035 	}
3036 	else {
3037 	  /* can't read nothing or invalid format */
3038 	  p = prev;
3039 	}
3040       }
3041       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3042 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3043 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3044 	if (p == prev) {  /* can't read nothing. */
3045 	  num = 0; /* but, it's not error */
3046 	}
3047 	tok->type = TK_RAW_BYTE;
3048 	tok->base = 16;
3049 	tok->u.c  = num;
3050       }
3051       break;
3052 
3053     case 'u':
3054       if (PEND) break;
3055 
3056       prev = p;
3057       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3058 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3059 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3060 	if (p == prev) {  /* can't read nothing. */
3061 	  num = 0; /* but, it's not error */
3062 	}
3063 	tok->type   = TK_CODE_POINT;
3064 	tok->base   = 16;
3065 	tok->u.code = (OnigCodePoint )num;
3066       }
3067       break;
3068 
3069     case '0':
3070     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3071       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3072 	PUNFETCH;
3073 	prev = p;
3074 	num = scan_unsigned_octal_number(&p, end, 3, enc);
3075 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3076 	if (p == prev) {  /* can't read nothing. */
3077 	  num = 0; /* but, it's not error */
3078 	}
3079 	tok->type = TK_RAW_BYTE;
3080 	tok->base = 8;
3081 	tok->u.c  = num;
3082       }
3083       break;
3084 
3085     default:
3086       PUNFETCH;
3087       num = fetch_escaped_value(&p, end, env);
3088       if (num < 0) return num;
3089       if (tok->u.c != num) {
3090 	tok->u.code = (OnigCodePoint )num;
3091 	tok->type   = TK_CODE_POINT;
3092       }
3093       break;
3094     }
3095   }
3096   else if (c == '[') {
3097     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3098       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3099       tok->backp = p; /* point at '[' is readed */
3100       PINC;
3101       if (str_exist_check_with_esc(send, 2, p, end,
3102                                    (OnigCodePoint )']', enc, syn)) {
3103 	tok->type = TK_POSIX_BRACKET_OPEN;
3104       }
3105       else {
3106 	PUNFETCH;
3107 	goto cc_in_cc;
3108       }
3109     }
3110     else {
3111     cc_in_cc:
3112       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3113 	tok->type = TK_CC_CC_OPEN;
3114       }
3115       else {
3116 	CC_ESC_WARN(env, (UChar* )"[");
3117       }
3118     }
3119   }
3120   else if (c == '&') {
3121     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3122 	!PEND && (PPEEK_IS('&'))) {
3123       PINC;
3124       tok->type = TK_CC_AND;
3125     }
3126   }
3127 
3128  end:
3129   *src = p;
3130   return tok->type;
3131 }
3132 
3133 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3134 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3135 {
3136   int r, num;
3137   OnigCodePoint c;
3138   OnigEncoding enc = env->enc;
3139   OnigSyntaxType* syn = env->syntax;
3140   UChar* prev;
3141   UChar* p = *src;
3142   PFETCH_READY;
3143 
3144  start:
3145   if (PEND) {
3146     tok->type = TK_EOT;
3147     return tok->type;
3148   }
3149 
3150   tok->type  = TK_STRING;
3151   tok->base  = 0;
3152   tok->backp = p;
3153 
3154   PFETCH(c);
3155   if (IS_MC_ESC_CODE(c, syn)) {
3156     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3157 
3158     tok->backp = p;
3159     PFETCH(c);
3160 
3161     tok->u.c = c;
3162     tok->escaped = 1;
3163     switch (c) {
3164     case '*':
3165       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3166       tok->type = TK_OP_REPEAT;
3167       tok->u.repeat.lower = 0;
3168       tok->u.repeat.upper = REPEAT_INFINITE;
3169       goto greedy_check;
3170       break;
3171 
3172     case '+':
3173       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3174       tok->type = TK_OP_REPEAT;
3175       tok->u.repeat.lower = 1;
3176       tok->u.repeat.upper = REPEAT_INFINITE;
3177       goto greedy_check;
3178       break;
3179 
3180     case '?':
3181       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3182       tok->type = TK_OP_REPEAT;
3183       tok->u.repeat.lower = 0;
3184       tok->u.repeat.upper = 1;
3185     greedy_check:
3186       if (!PEND && PPEEK_IS('?') &&
3187 	  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3188 	PFETCH(c);
3189 	tok->u.repeat.greedy     = 0;
3190 	tok->u.repeat.possessive = 0;
3191       }
3192       else {
3193       possessive_check:
3194 	if (!PEND && PPEEK_IS('+') &&
3195 	    ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3196 	      tok->type != TK_INTERVAL)  ||
3197 	     (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3198 	      tok->type == TK_INTERVAL))) {
3199 	  PFETCH(c);
3200 	  tok->u.repeat.greedy     = 1;
3201 	  tok->u.repeat.possessive = 1;
3202 	}
3203 	else {
3204 	  tok->u.repeat.greedy     = 1;
3205 	  tok->u.repeat.possessive = 0;
3206 	}
3207       }
3208       break;
3209 
3210     case '{':
3211       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3212       r = fetch_range_quantifier(&p, end, tok, env);
3213       if (r < 0) return r;  /* error */
3214       if (r == 0) goto greedy_check;
3215       else if (r == 2) { /* {n} */
3216 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3217 	  goto possessive_check;
3218 
3219 	goto greedy_check;
3220       }
3221       /* r == 1 : normal char */
3222       break;
3223 
3224     case '|':
3225       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3226       tok->type = TK_ALT;
3227       break;
3228 
3229     case '(':
3230       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3231       tok->type = TK_SUBEXP_OPEN;
3232       break;
3233 
3234     case ')':
3235       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3236       tok->type = TK_SUBEXP_CLOSE;
3237       break;
3238 
3239     case 'w':
3240       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3241       tok->type = TK_CHAR_TYPE;
3242       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3243       tok->u.prop.not   = 0;
3244       break;
3245 
3246     case 'W':
3247       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3248       tok->type = TK_CHAR_TYPE;
3249       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3250       tok->u.prop.not   = 1;
3251       break;
3252 
3253     case 'b':
3254       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3255       tok->type = TK_ANCHOR;
3256       tok->u.anchor = ANCHOR_WORD_BOUND;
3257       break;
3258 
3259     case 'B':
3260       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3261       tok->type = TK_ANCHOR;
3262       tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3263       break;
3264 
3265 #ifdef USE_WORD_BEGIN_END
3266     case '<':
3267       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3268       tok->type = TK_ANCHOR;
3269       tok->u.anchor = ANCHOR_WORD_BEGIN;
3270       break;
3271 
3272     case '>':
3273       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3274       tok->type = TK_ANCHOR;
3275       tok->u.anchor = ANCHOR_WORD_END;
3276       break;
3277 #endif
3278 
3279     case 's':
3280       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3281       tok->type = TK_CHAR_TYPE;
3282       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3283       tok->u.prop.not   = 0;
3284       break;
3285 
3286     case 'S':
3287       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3288       tok->type = TK_CHAR_TYPE;
3289       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3290       tok->u.prop.not   = 1;
3291       break;
3292 
3293     case 'd':
3294       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3295       tok->type = TK_CHAR_TYPE;
3296       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3297       tok->u.prop.not   = 0;
3298       break;
3299 
3300     case 'D':
3301       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3302       tok->type = TK_CHAR_TYPE;
3303       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3304       tok->u.prop.not   = 1;
3305       break;
3306 
3307     case 'h':
3308       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3309       tok->type = TK_CHAR_TYPE;
3310       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3311       tok->u.prop.not   = 0;
3312       break;
3313 
3314     case 'H':
3315       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3316       tok->type = TK_CHAR_TYPE;
3317       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3318       tok->u.prop.not   = 1;
3319       break;
3320 
3321     case 'A':
3322       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3323     begin_buf:
3324       tok->type = TK_ANCHOR;
3325       tok->u.subtype = ANCHOR_BEGIN_BUF;
3326       break;
3327 
3328     case 'Z':
3329       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3330       tok->type = TK_ANCHOR;
3331       tok->u.subtype = ANCHOR_SEMI_END_BUF;
3332       break;
3333 
3334     case 'z':
3335       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3336     end_buf:
3337       tok->type = TK_ANCHOR;
3338       tok->u.subtype = ANCHOR_END_BUF;
3339       break;
3340 
3341     case 'G':
3342       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3343       tok->type = TK_ANCHOR;
3344       tok->u.subtype = ANCHOR_BEGIN_POSITION;
3345       break;
3346 
3347     case '`':
3348       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3349       goto begin_buf;
3350       break;
3351 
3352     case '\'':
3353       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3354       goto end_buf;
3355       break;
3356 
3357     case 'x':
3358       if (PEND) break;
3359 
3360       prev = p;
3361       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3362 	PINC;
3363 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3364 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3365 	if (!PEND) {
3366           if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3367             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3368         }
3369 
3370 	if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3371 	  PINC;
3372 	  tok->type   = TK_CODE_POINT;
3373 	  tok->u.code = (OnigCodePoint )num;
3374 	}
3375 	else {
3376 	  /* can't read nothing or invalid format */
3377 	  p = prev;
3378 	}
3379       }
3380       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3381 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3382 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3383 	if (p == prev) {  /* can't read nothing. */
3384 	  num = 0; /* but, it's not error */
3385 	}
3386 	tok->type = TK_RAW_BYTE;
3387 	tok->base = 16;
3388 	tok->u.c  = num;
3389       }
3390       break;
3391 
3392     case 'u':
3393       if (PEND) break;
3394 
3395       prev = p;
3396       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3397 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3398 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3399 	if (p == prev) {  /* can't read nothing. */
3400 	  num = 0; /* but, it's not error */
3401 	}
3402 	tok->type   = TK_CODE_POINT;
3403 	tok->base   = 16;
3404 	tok->u.code = (OnigCodePoint )num;
3405       }
3406       break;
3407 
3408     case '1': case '2': case '3': case '4':
3409     case '5': case '6': case '7': case '8': case '9':
3410       PUNFETCH;
3411       prev = p;
3412       num = onig_scan_unsigned_number(&p, end, enc);
3413       if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3414         goto skip_backref;
3415       }
3416 
3417       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3418 	  (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3419 	if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3420 	  if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3421 	    return ONIGERR_INVALID_BACKREF;
3422 	}
3423 
3424 	tok->type = TK_BACKREF;
3425 	tok->u.backref.num     = 1;
3426 	tok->u.backref.ref1    = num;
3427 	tok->u.backref.by_name = 0;
3428 #ifdef USE_BACKREF_WITH_LEVEL
3429 	tok->u.backref.exist_level = 0;
3430 #endif
3431 	break;
3432       }
3433 
3434     skip_backref:
3435       if (c == '8' || c == '9') {
3436 	/* normal char */
3437 	p = prev; PINC;
3438 	break;
3439       }
3440 
3441       p = prev;
3442       /* fall through */
3443     case '0':
3444       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3445 	prev = p;
3446 	num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3447 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3448 	if (p == prev) {  /* can't read nothing. */
3449 	  num = 0; /* but, it's not error */
3450 	}
3451 	tok->type = TK_RAW_BYTE;
3452 	tok->base = 8;
3453 	tok->u.c  = num;
3454       }
3455       else if (c != '0') {
3456 	PINC;
3457       }
3458       break;
3459 
3460 #ifdef USE_NAMED_GROUP
3461     case 'k':
3462       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3463 	PFETCH(c);
3464 	if (c == '<' || c == '\'') {
3465 	  UChar* name_end;
3466 	  int* backs;
3467 	  int back_num;
3468 
3469 	  prev = p;
3470 
3471 #ifdef USE_BACKREF_WITH_LEVEL
3472 	  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3473 	  r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3474 				    env, &back_num, &tok->u.backref.level);
3475 	  if (r == 1) tok->u.backref.exist_level = 1;
3476 	  else        tok->u.backref.exist_level = 0;
3477 #else
3478 	  r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3479 #endif
3480 	  if (r < 0) return r;
3481 
3482 	  if (back_num != 0) {
3483 	    if (back_num < 0) {
3484 	      back_num = BACKREF_REL_TO_ABS(back_num, env);
3485 	      if (back_num <= 0)
3486 		return ONIGERR_INVALID_BACKREF;
3487 	    }
3488 
3489 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3490 	      if (back_num > env->num_mem ||
3491 		  IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3492 		return ONIGERR_INVALID_BACKREF;
3493 	    }
3494 	    tok->type = TK_BACKREF;
3495 	    tok->u.backref.by_name = 0;
3496 	    tok->u.backref.num  = 1;
3497 	    tok->u.backref.ref1 = back_num;
3498 	  }
3499 	  else {
3500 	    num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3501 	    if (num <= 0) {
3502 	      onig_scan_env_set_error_string(env,
3503 			     ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3504 	      return ONIGERR_UNDEFINED_NAME_REFERENCE;
3505 	    }
3506 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3507 	      int i;
3508 	      for (i = 0; i < num; i++) {
3509 		if (backs[i] > env->num_mem ||
3510 		    IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3511 		  return ONIGERR_INVALID_BACKREF;
3512 	      }
3513 	    }
3514 
3515 	    tok->type = TK_BACKREF;
3516 	    tok->u.backref.by_name = 1;
3517 	    if (num == 1) {
3518 	      tok->u.backref.num  = 1;
3519 	      tok->u.backref.ref1 = backs[0];
3520 	    }
3521 	    else {
3522 	      tok->u.backref.num  = num;
3523 	      tok->u.backref.refs = backs;
3524 	    }
3525 	  }
3526 	}
3527 	else
3528 	  PUNFETCH;
3529       }
3530       break;
3531 #endif
3532 
3533 #ifdef USE_SUBEXP_CALL
3534     case 'g':
3535       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3536 	PFETCH(c);
3537 	if (c == '<' || c == '\'') {
3538 	  int gnum;
3539 	  UChar* name_end;
3540 
3541 	  prev = p;
3542 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3543 	  if (r < 0) return r;
3544 
3545 	  tok->type = TK_CALL;
3546 	  tok->u.call.name     = prev;
3547 	  tok->u.call.name_end = name_end;
3548 	  tok->u.call.gnum     = gnum;
3549 	}
3550 	else
3551 	  PUNFETCH;
3552       }
3553       break;
3554 #endif
3555 
3556     case 'Q':
3557       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3558 	tok->type = TK_QUOTE_OPEN;
3559       }
3560       break;
3561 
3562     case 'p':
3563     case 'P':
3564       if (PPEEK_IS('{') &&
3565 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3566 	PINC;
3567 	tok->type = TK_CHAR_PROPERTY;
3568 	tok->u.prop.not = (c == 'P' ? 1 : 0);
3569 
3570 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3571 	  PFETCH(c);
3572 	  if (c == '^') {
3573 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3574 	  }
3575 	  else
3576 	    PUNFETCH;
3577 	}
3578       }
3579       break;
3580 
3581     default:
3582       PUNFETCH;
3583       num = fetch_escaped_value(&p, end, env);
3584       if (num < 0) return num;
3585       /* set_raw: */
3586       if (tok->u.c != num) {
3587 	tok->type = TK_CODE_POINT;
3588 	tok->u.code = (OnigCodePoint )num;
3589       }
3590       else { /* string */
3591 	p = tok->backp + enclen(enc, tok->backp);
3592       }
3593       break;
3594     }
3595   }
3596   else {
3597     tok->u.c = c;
3598     tok->escaped = 0;
3599 
3600 #ifdef USE_VARIABLE_META_CHARS
3601     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3602 	IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3603       if (c == MC_ANYCHAR(syn))
3604 	goto any_char;
3605       else if (c == MC_ANYTIME(syn))
3606 	goto anytime;
3607       else if (c == MC_ZERO_OR_ONE_TIME(syn))
3608 	goto zero_or_one_time;
3609       else if (c == MC_ONE_OR_MORE_TIME(syn))
3610 	goto one_or_more_time;
3611       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3612 	tok->type = TK_ANYCHAR_ANYTIME;
3613 	goto out;
3614       }
3615     }
3616 #endif
3617 
3618     switch (c) {
3619     case '.':
3620       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3621 #ifdef USE_VARIABLE_META_CHARS
3622     any_char:
3623 #endif
3624       tok->type = TK_ANYCHAR;
3625       break;
3626 
3627     case '*':
3628       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3629 #ifdef USE_VARIABLE_META_CHARS
3630     anytime:
3631 #endif
3632       tok->type = TK_OP_REPEAT;
3633       tok->u.repeat.lower = 0;
3634       tok->u.repeat.upper = REPEAT_INFINITE;
3635       goto greedy_check;
3636       break;
3637 
3638     case '+':
3639       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3640 #ifdef USE_VARIABLE_META_CHARS
3641     one_or_more_time:
3642 #endif
3643       tok->type = TK_OP_REPEAT;
3644       tok->u.repeat.lower = 1;
3645       tok->u.repeat.upper = REPEAT_INFINITE;
3646       goto greedy_check;
3647       break;
3648 
3649     case '?':
3650       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3651 #ifdef USE_VARIABLE_META_CHARS
3652     zero_or_one_time:
3653 #endif
3654       tok->type = TK_OP_REPEAT;
3655       tok->u.repeat.lower = 0;
3656       tok->u.repeat.upper = 1;
3657       goto greedy_check;
3658       break;
3659 
3660     case '{':
3661       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3662       r = fetch_range_quantifier(&p, end, tok, env);
3663       if (r < 0) return r;  /* error */
3664       if (r == 0) goto greedy_check;
3665       else if (r == 2) { /* {n} */
3666 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3667 	  goto possessive_check;
3668 
3669 	goto greedy_check;
3670       }
3671       /* r == 1 : normal char */
3672       break;
3673 
3674     case '|':
3675       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3676       tok->type = TK_ALT;
3677       break;
3678 
3679     case '(':
3680       if (PPEEK_IS('?') &&
3681           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3682         PINC;
3683         if (PPEEK_IS('#')) {
3684           PFETCH(c);
3685           while (1) {
3686             if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3687             PFETCH(c);
3688             if (c == MC_ESC(syn)) {
3689               if (!PEND) PFETCH(c);
3690             }
3691             else {
3692               if (c == ')') break;
3693             }
3694           }
3695           goto start;
3696         }
3697         PUNFETCH;
3698       }
3699 
3700       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3701       tok->type = TK_SUBEXP_OPEN;
3702       break;
3703 
3704     case ')':
3705       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3706       tok->type = TK_SUBEXP_CLOSE;
3707       break;
3708 
3709     case '^':
3710       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3711       tok->type = TK_ANCHOR;
3712       tok->u.subtype = (IS_SINGLELINE(env->option)
3713 			? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3714       break;
3715 
3716     case '$':
3717       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3718       tok->type = TK_ANCHOR;
3719       tok->u.subtype = (IS_SINGLELINE(env->option)
3720 			? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3721       break;
3722 
3723     case '[':
3724       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3725       tok->type = TK_CC_OPEN;
3726       break;
3727 
3728     case ']':
3729       if (*src > env->pattern)   /* /].../ is allowed. */
3730 	CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3731       break;
3732 
3733     case '#':
3734       if (IS_EXTEND(env->option)) {
3735 	while (!PEND) {
3736 	  PFETCH(c);
3737 	  if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3738 	    break;
3739 	}
3740 	goto start;
3741 	break;
3742       }
3743       break;
3744 
3745     case ' ': case '\t': case '\n': case '\r': case '\f':
3746       if (IS_EXTEND(env->option))
3747 	goto start;
3748       break;
3749 
3750     default:
3751       /* string */
3752       break;
3753     }
3754   }
3755 
3756 #ifdef USE_VARIABLE_META_CHARS
3757  out:
3758 #endif
3759   *src = p;
3760   return tok->type;
3761 }
3762 
3763 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3764 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3765 			 OnigEncoding enc ARG_UNUSED,
3766                          OnigCodePoint sb_out, const OnigCodePoint mbr[])
3767 {
3768   int i, r;
3769   OnigCodePoint j;
3770 
3771   int n = ONIGENC_CODE_RANGE_NUM(mbr);
3772 
3773   if (not == 0) {
3774     for (i = 0; i < n; i++) {
3775       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
3776            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3777 	if (j >= sb_out) {
3778 	  if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3779 	  else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3780 	    r = add_code_range_to_buf(&(cc->mbuf), j,
3781 				      ONIGENC_CODE_RANGE_TO(mbr, i));
3782 	    if (r != 0) return r;
3783 	    i++;
3784 	  }
3785 
3786 	  goto sb_end;
3787 	}
3788         BITSET_SET_BIT(cc->bs, j);
3789       }
3790     }
3791 
3792   sb_end:
3793     for ( ; i < n; i++) {
3794       r = add_code_range_to_buf(&(cc->mbuf),
3795                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
3796                                 ONIGENC_CODE_RANGE_TO(mbr, i));
3797       if (r != 0) return r;
3798     }
3799   }
3800   else {
3801     OnigCodePoint prev = 0;
3802 
3803     for (i = 0; i < n; i++) {
3804       for (j = prev;
3805 	   j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3806 	if (j >= sb_out) {
3807 	  goto sb_end2;
3808 	}
3809 	BITSET_SET_BIT(cc->bs, j);
3810       }
3811       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3812     }
3813     for (j = prev; j < sb_out; j++) {
3814       BITSET_SET_BIT(cc->bs, j);
3815     }
3816 
3817   sb_end2:
3818     prev = sb_out;
3819 
3820     for (i = 0; i < n; i++) {
3821       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3822 	r = add_code_range_to_buf(&(cc->mbuf), prev,
3823                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3824 	if (r != 0) return r;
3825       }
3826       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3827     }
3828     if (prev < 0x7fffffff) {
3829       r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3830       if (r != 0) return r;
3831     }
3832   }
3833 
3834   return 0;
3835 }
3836 
3837 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3838 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3839 {
3840   int c, r;
3841   const OnigCodePoint *ranges;
3842   OnigCodePoint sb_out;
3843   OnigEncoding enc = env->enc;
3844 
3845   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3846   if (r == 0) {
3847     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3848   }
3849   else if (r != ONIG_NO_SUPPORT_CONFIG) {
3850     return r;
3851   }
3852 
3853   r = 0;
3854   switch (ctype) {
3855   case ONIGENC_CTYPE_ALPHA:
3856   case ONIGENC_CTYPE_BLANK:
3857   case ONIGENC_CTYPE_CNTRL:
3858   case ONIGENC_CTYPE_DIGIT:
3859   case ONIGENC_CTYPE_LOWER:
3860   case ONIGENC_CTYPE_PUNCT:
3861   case ONIGENC_CTYPE_SPACE:
3862   case ONIGENC_CTYPE_UPPER:
3863   case ONIGENC_CTYPE_XDIGIT:
3864   case ONIGENC_CTYPE_ASCII:
3865   case ONIGENC_CTYPE_ALNUM:
3866     if (not != 0) {
3867       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3868 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3869 	  BITSET_SET_BIT(cc->bs, c);
3870       }
3871       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3872     }
3873     else {
3874       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3875 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3876 	  BITSET_SET_BIT(cc->bs, c);
3877       }
3878     }
3879     break;
3880 
3881   case ONIGENC_CTYPE_GRAPH:
3882   case ONIGENC_CTYPE_PRINT:
3883     if (not != 0) {
3884       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3885 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3886 	  BITSET_SET_BIT(cc->bs, c);
3887       }
3888     }
3889     else {
3890       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3891 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3892 	  BITSET_SET_BIT(cc->bs, c);
3893       }
3894       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3895     }
3896     break;
3897 
3898   case ONIGENC_CTYPE_WORD:
3899     if (not == 0) {
3900       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3901 	if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3902       }
3903       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3904     }
3905     else {
3906       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3907         if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3908 	    && ! ONIGENC_IS_CODE_WORD(enc, c))
3909 	  BITSET_SET_BIT(cc->bs, c);
3910       }
3911     }
3912     break;
3913 
3914   default:
3915     return ONIGERR_PARSER_BUG;
3916     break;
3917   }
3918 
3919   return r;
3920 }
3921 
3922 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3923 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3924 {
3925 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
3926 #define POSIX_BRACKET_NAME_MIN_LEN         4
3927 
3928   static PosixBracketEntryType PBS[] = {
3929     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3930     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3931     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
3932     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3933     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
3934     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
3935     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
3936     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
3937     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
3938     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
3939     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
3940     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3941     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
3942     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
3943     { (UChar* )NULL,     -1, 0 }
3944   };
3945 
3946   PosixBracketEntryType *pb;
3947   int not, i, r;
3948   OnigCodePoint c;
3949   OnigEncoding enc = env->enc;
3950   UChar *p = *src;
3951 
3952   if (PPEEK_IS('^')) {
3953     PINC_S;
3954     not = 1;
3955   }
3956   else
3957     not = 0;
3958 
3959   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3960     goto not_posix_bracket;
3961 
3962   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3963     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3964       p = (UChar* )onigenc_step(enc, p, end, pb->len);
3965       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3966         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3967 
3968       r = add_ctype_to_cc(cc, pb->ctype, not, env);
3969       if (r != 0) return r;
3970 
3971       PINC_S; PINC_S;
3972       *src = p;
3973       return 0;
3974     }
3975   }
3976 
3977  not_posix_bracket:
3978   c = 0;
3979   i = 0;
3980   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3981     PINC_S;
3982     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3983   }
3984   if (c == ':' && ! PEND) {
3985     PINC_S;
3986     if (! PEND) {
3987       PFETCH_S(c);
3988       if (c == ']')
3989         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3990     }
3991   }
3992 
3993   return 1;  /* 1: is not POSIX bracket, but no error. */
3994 }
3995 
3996 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3997 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3998 {
3999   int r;
4000   OnigCodePoint c;
4001   OnigEncoding enc = env->enc;
4002   UChar *prev, *start, *p = *src;
4003 
4004   r = 0;
4005   start = prev = p;
4006 
4007   while (!PEND) {
4008     prev = p;
4009     PFETCH_S(c);
4010     if (c == '}') {
4011       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4012       if (r < 0) break;
4013 
4014       *src = p;
4015       return r;
4016     }
4017     else if (c == '(' || c == ')' || c == '{' || c == '|') {
4018       r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4019       break;
4020     }
4021   }
4022 
4023   onig_scan_env_set_error_string(env, r, *src, prev);
4024   return r;
4025 }
4026 
4027 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4028 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4029 		    ScanEnv* env)
4030 {
4031   int r, ctype;
4032   CClassNode* cc;
4033 
4034   ctype = fetch_char_property_to_ctype(src, end, env);
4035   if (ctype < 0) return ctype;
4036 
4037   *np = node_new_cclass();
4038   CHECK_NULL_RETURN_MEMERR(*np);
4039   cc = NCCLASS(*np);
4040   r = add_ctype_to_cc(cc, ctype, 0, env);
4041   if (r != 0) return r;
4042   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4043 
4044   return 0;
4045 }
4046 
4047 
4048 enum CCSTATE {
4049   CCS_VALUE,
4050   CCS_RANGE,
4051   CCS_COMPLETE,
4052   CCS_START
4053 };
4054 
4055 enum CCVALTYPE {
4056   CCV_SB,
4057   CCV_CODE_POINT,
4058   CCV_CLASS
4059 };
4060 
4061 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4062 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4063 		 enum CCSTATE* state, ScanEnv* env)
4064 {
4065   int r;
4066 
4067   if (*state == CCS_RANGE)
4068     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4069 
4070   if (*state == CCS_VALUE && *type != CCV_CLASS) {
4071     if (*type == CCV_SB)
4072       BITSET_SET_BIT(cc->bs, (int )(*vs));
4073     else if (*type == CCV_CODE_POINT) {
4074       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4075       if (r < 0) return r;
4076     }
4077   }
4078 
4079   *state = CCS_VALUE;
4080   *type  = CCV_CLASS;
4081   return 0;
4082 }
4083 
4084 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4085 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4086 	       int* vs_israw, int v_israw,
4087 	       enum CCVALTYPE intype, enum CCVALTYPE* type,
4088 	       enum CCSTATE* state, ScanEnv* env)
4089 {
4090   int r;
4091 
4092   switch (*state) {
4093   case CCS_VALUE:
4094     if (*type == CCV_SB)
4095       BITSET_SET_BIT(cc->bs, (int )(*vs));
4096     else if (*type == CCV_CODE_POINT) {
4097       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4098       if (r < 0) return r;
4099     }
4100     break;
4101 
4102   case CCS_RANGE:
4103     if (intype == *type) {
4104       if (intype == CCV_SB) {
4105         if (*vs > 0xff || v > 0xff)
4106           return ONIGERR_INVALID_CODE_POINT_VALUE;
4107 
4108 	if (*vs > v) {
4109 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4110 	    goto ccs_range_end;
4111 	  else
4112 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4113 	}
4114 	bitset_set_range(cc->bs, (int )*vs, (int )v);
4115       }
4116       else {
4117 	r = add_code_range(&(cc->mbuf), env, *vs, v);
4118 	if (r < 0) return r;
4119       }
4120     }
4121     else {
4122 #if 0
4123       if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4124 #endif
4125 	if (*vs > v) {
4126 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4127 	    goto ccs_range_end;
4128 	  else
4129 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4130 	}
4131 	bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4132 	r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4133 	if (r < 0) return r;
4134 #if 0
4135       }
4136       else
4137 	return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4138 #endif
4139     }
4140   ccs_range_end:
4141     *state = CCS_COMPLETE;
4142     break;
4143 
4144   case CCS_COMPLETE:
4145   case CCS_START:
4146     *state = CCS_VALUE;
4147     break;
4148 
4149   default:
4150     break;
4151   }
4152 
4153   *vs_israw = v_israw;
4154   *vs       = v;
4155   *type     = intype;
4156   return 0;
4157 }
4158 
4159 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4160 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4161 		 ScanEnv* env)
4162 {
4163   int in_esc;
4164   OnigCodePoint code;
4165   OnigEncoding enc = env->enc;
4166   UChar* p = from;
4167 
4168   in_esc = 0;
4169   while (! PEND) {
4170     if (ignore_escaped && in_esc) {
4171       in_esc = 0;
4172     }
4173     else {
4174       PFETCH_S(code);
4175       if (code == c) return 1;
4176       if (code == MC_ESC(env->syntax)) in_esc = 1;
4177     }
4178   }
4179   return 0;
4180 }
4181 
4182 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4183 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4184 		 ScanEnv* env)
4185 {
4186   int r, neg, len, fetched, and_start;
4187   OnigCodePoint v, vs;
4188   UChar *p;
4189   Node* node;
4190   CClassNode *cc, *prev_cc;
4191   CClassNode work_cc;
4192 
4193   enum CCSTATE state;
4194   enum CCVALTYPE val_type, in_type;
4195   int val_israw, in_israw;
4196 
4197   prev_cc = (CClassNode* )NULL;
4198   *np = NULL_NODE;
4199   r = fetch_token_in_cc(tok, src, end, env);
4200   if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4201     neg = 1;
4202     r = fetch_token_in_cc(tok, src, end, env);
4203   }
4204   else {
4205     neg = 0;
4206   }
4207 
4208   if (r < 0) return r;
4209   if (r == TK_CC_CLOSE) {
4210     if (! code_exist_check((OnigCodePoint )']',
4211                            *src, env->pattern_end, 1, env))
4212       return ONIGERR_EMPTY_CHAR_CLASS;
4213 
4214     CC_ESC_WARN(env, (UChar* )"]");
4215     r = tok->type = TK_CHAR;  /* allow []...] */
4216   }
4217 
4218   *np = node = node_new_cclass();
4219   CHECK_NULL_RETURN_MEMERR(node);
4220   cc = NCCLASS(node);
4221 
4222   and_start = 0;
4223   state = CCS_START;
4224   p = *src;
4225   while (r != TK_CC_CLOSE) {
4226     fetched = 0;
4227     switch (r) {
4228     case TK_CHAR:
4229       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4230       if (len > 1) {
4231 	in_type = CCV_CODE_POINT;
4232       }
4233       else if (len < 0) {
4234 	r = len;
4235 	goto err;
4236       }
4237       else {
4238       sb_char:
4239 	in_type = CCV_SB;
4240       }
4241       v = (OnigCodePoint )tok->u.c;
4242       in_israw = 0;
4243       goto val_entry2;
4244       break;
4245 
4246     case TK_RAW_BYTE:
4247       /* tok->base != 0 : octal or hexadec. */
4248       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4249 	UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4250 	UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4251 	UChar* psave = p;
4252 	int i, base = tok->base;
4253 
4254 	buf[0] = (UChar)tok->u.c;
4255 	for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4256 	  r = fetch_token_in_cc(tok, &p, end, env);
4257 	  if (r < 0) goto err;
4258 	  if (r != TK_RAW_BYTE || tok->base != base) {
4259 	    fetched = 1;
4260 	    break;
4261 	  }
4262 	  buf[i] = (UChar)tok->u.c;
4263 	}
4264 
4265 	if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4266 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4267 	  goto err;
4268 	}
4269 
4270 	len = enclen(env->enc, buf);
4271 	if (i < len) {
4272 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4273 	  goto err;
4274 	}
4275 	else if (i > len) { /* fetch back */
4276 	  p = psave;
4277 	  for (i = 1; i < len; i++) {
4278 	    r = fetch_token_in_cc(tok, &p, end, env);
4279 	  }
4280 	  fetched = 0;
4281 	}
4282 
4283 	if (i == 1) {
4284 	  v = (OnigCodePoint )buf[0];
4285 	  goto raw_single;
4286 	}
4287 	else {
4288 	  v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4289 	  in_type = CCV_CODE_POINT;
4290 	}
4291       }
4292       else {
4293 	v = (OnigCodePoint )tok->u.c;
4294       raw_single:
4295 	in_type = CCV_SB;
4296       }
4297       in_israw = 1;
4298       goto val_entry2;
4299       break;
4300 
4301     case TK_CODE_POINT:
4302       v = tok->u.code;
4303       in_israw = 1;
4304     val_entry:
4305       len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4306       if (len < 0) {
4307 	r = len;
4308 	goto err;
4309       }
4310       in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4311     val_entry2:
4312       r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4313 			 &state, env);
4314       if (r != 0) goto err;
4315       break;
4316 
4317     case TK_POSIX_BRACKET_OPEN:
4318       r = parse_posix_bracket(cc, &p, end, env);
4319       if (r < 0) goto err;
4320       if (r == 1) {  /* is not POSIX bracket */
4321 	CC_ESC_WARN(env, (UChar* )"[");
4322 	p = tok->backp;
4323 	v = (OnigCodePoint )tok->u.c;
4324 	in_israw = 0;
4325 	goto val_entry;
4326       }
4327       goto next_class;
4328       break;
4329 
4330     case TK_CHAR_TYPE:
4331       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4332       if (r != 0) return r;
4333 
4334     next_class:
4335       r = next_state_class(cc, &vs, &val_type, &state, env);
4336       if (r != 0) goto err;
4337       break;
4338 
4339     case TK_CHAR_PROPERTY:
4340       {
4341 	int ctype;
4342 
4343 	ctype = fetch_char_property_to_ctype(&p, end, env);
4344 	if (ctype < 0) return ctype;
4345 	r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4346 	if (r != 0) return r;
4347 	goto next_class;
4348       }
4349       break;
4350 
4351     case TK_CC_RANGE:
4352       if (state == CCS_VALUE) {
4353 	r = fetch_token_in_cc(tok, &p, end, env);
4354 	if (r < 0) goto err;
4355 	fetched = 1;
4356 	if (r == TK_CC_CLOSE) { /* allow [x-] */
4357 	range_end_val:
4358 	  v = (OnigCodePoint )'-';
4359 	  in_israw = 0;
4360 	  goto val_entry;
4361 	}
4362 	else if (r == TK_CC_AND) {
4363 	  CC_ESC_WARN(env, (UChar* )"-");
4364 	  goto range_end_val;
4365 	}
4366 	state = CCS_RANGE;
4367       }
4368       else if (state == CCS_START) {
4369 	/* [-xa] is allowed */
4370 	v = (OnigCodePoint )tok->u.c;
4371 	in_israw = 0;
4372 
4373 	r = fetch_token_in_cc(tok, &p, end, env);
4374 	if (r < 0) goto err;
4375 	fetched = 1;
4376 	/* [--x] or [a&&-x] is warned. */
4377 	if (r == TK_CC_RANGE || and_start != 0)
4378 	  CC_ESC_WARN(env, (UChar* )"-");
4379 
4380 	goto val_entry;
4381       }
4382       else if (state == CCS_RANGE) {
4383 	CC_ESC_WARN(env, (UChar* )"-");
4384 	goto sb_char;  /* [!--x] is allowed */
4385       }
4386       else { /* CCS_COMPLETE */
4387 	r = fetch_token_in_cc(tok, &p, end, env);
4388 	if (r < 0) goto err;
4389 	fetched = 1;
4390 	if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4391 	else if (r == TK_CC_AND) {
4392 	  CC_ESC_WARN(env, (UChar* )"-");
4393 	  goto range_end_val;
4394 	}
4395 
4396 	if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4397 	  CC_ESC_WARN(env, (UChar* )"-");
4398 	  goto sb_char;   /* [0-9-a] is allowed as [0-9\-a] */
4399 	}
4400 	r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4401 	goto err;
4402       }
4403       break;
4404 
4405     case TK_CC_CC_OPEN: /* [ */
4406       {
4407 	Node *anode;
4408 	CClassNode* acc;
4409 
4410 	r = parse_char_class(&anode, tok, &p, end, env);
4411 	if (r != 0) goto cc_open_err;
4412 	acc = NCCLASS(anode);
4413 	r = or_cclass(cc, acc, env->enc);
4414 
4415 	onig_node_free(anode);
4416       cc_open_err:
4417 	if (r != 0) goto err;
4418       }
4419       break;
4420 
4421     case TK_CC_AND: /* && */
4422       {
4423 	if (state == CCS_VALUE) {
4424 	  r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4425 			     &val_type, &state, env);
4426 	  if (r != 0) goto err;
4427 	}
4428 	/* initialize local variables */
4429 	and_start = 1;
4430 	state = CCS_START;
4431 
4432 	if (IS_NOT_NULL(prev_cc)) {
4433 	  r = and_cclass(prev_cc, cc, env->enc);
4434 	  if (r != 0) goto err;
4435 	  bbuf_free(cc->mbuf);
4436 	}
4437 	else {
4438 	  prev_cc = cc;
4439 	  cc = &work_cc;
4440 	}
4441 	initialize_cclass(cc);
4442       }
4443       break;
4444 
4445     case TK_EOT:
4446       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4447       goto err;
4448       break;
4449     default:
4450       r = ONIGERR_PARSER_BUG;
4451       goto err;
4452       break;
4453     }
4454 
4455     if (fetched)
4456       r = tok->type;
4457     else {
4458       r = fetch_token_in_cc(tok, &p, end, env);
4459       if (r < 0) goto err;
4460     }
4461   }
4462 
4463   if (state == CCS_VALUE) {
4464     r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4465 		       &val_type, &state, env);
4466     if (r != 0) goto err;
4467   }
4468 
4469   if (IS_NOT_NULL(prev_cc)) {
4470     r = and_cclass(prev_cc, cc, env->enc);
4471     if (r != 0) goto err;
4472     bbuf_free(cc->mbuf);
4473     cc = prev_cc;
4474   }
4475 
4476   if (neg != 0)
4477     NCCLASS_SET_NOT(cc);
4478   else
4479     NCCLASS_CLEAR_NOT(cc);
4480   if (IS_NCCLASS_NOT(cc) &&
4481       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4482     int is_empty;
4483 
4484     is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4485     if (is_empty != 0)
4486       BITSET_IS_EMPTY(cc->bs, is_empty);
4487 
4488     if (is_empty == 0) {
4489 #define NEWLINE_CODE    0x0a
4490 
4491       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4492         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4493           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4494         else
4495           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4496       }
4497     }
4498   }
4499   *src = p;
4500   return 0;
4501 
4502  err:
4503   if (cc != NCCLASS(*np))
4504     bbuf_free(cc->mbuf);
4505   onig_node_free(*np);
4506   return r;
4507 }
4508 
4509 static int parse_subexp(Node** top, OnigToken* tok, int term,
4510 			UChar** src, UChar* end, ScanEnv* env);
4511 
4512 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4513 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4514 	      ScanEnv* env)
4515 {
4516   int r, num;
4517   Node *target;
4518   OnigOptionType option;
4519   OnigCodePoint c;
4520   OnigEncoding enc = env->enc;
4521 
4522 #ifdef USE_NAMED_GROUP
4523   int list_capture;
4524 #endif
4525 
4526   UChar* p = *src;
4527   PFETCH_READY;
4528 
4529   *np = NULL;
4530   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4531 
4532   option = env->option;
4533   if (PPEEK_IS('?') &&
4534       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4535     PINC;
4536     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4537 
4538     PFETCH(c);
4539     switch (c) {
4540     case ':':   /* (?:...) grouping only */
4541     group:
4542       r = fetch_token(tok, &p, end, env);
4543       if (r < 0) return r;
4544       r = parse_subexp(np, tok, term, &p, end, env);
4545       if (r < 0) return r;
4546       *src = p;
4547       return 1; /* group */
4548       break;
4549 
4550     case '=':
4551       *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4552       break;
4553     case '!':  /*         preceding read */
4554       *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4555       break;
4556     case '>':            /* (?>...) stop backtrack */
4557       *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4558       break;
4559 
4560 #ifdef USE_NAMED_GROUP
4561     case '\'':
4562       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4563 	goto named_group1;
4564       }
4565       else
4566 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4567       break;
4568 #endif
4569 
4570     case '<':   /* look behind (?<=...), (?<!...) */
4571       PFETCH(c);
4572       if (c == '=')
4573 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4574       else if (c == '!')
4575 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4576 #ifdef USE_NAMED_GROUP
4577       else {
4578 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4579 	  UChar *name;
4580 	  UChar *name_end;
4581 
4582 	  PUNFETCH;
4583 	  c = '<';
4584 
4585 	named_group1:
4586 	  list_capture = 0;
4587 
4588 	named_group2:
4589 	  name = p;
4590 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4591 	  if (r < 0) return r;
4592 
4593 	  num = scan_env_add_mem_entry(env);
4594 	  if (num < 0) return num;
4595 	  if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4596 	    return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4597 
4598 	  r = name_add(env->reg, name, name_end, num, env);
4599 	  if (r != 0) return r;
4600 	  *np = node_new_enclose_memory(env->option, 1);
4601 	  CHECK_NULL_RETURN_MEMERR(*np);
4602 	  NENCLOSE(*np)->regnum = num;
4603 	  if (list_capture != 0)
4604 	    BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4605 	  env->num_named++;
4606 	}
4607 	else {
4608 	  return ONIGERR_UNDEFINED_GROUP_OPTION;
4609 	}
4610       }
4611 #else
4612       else {
4613 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4614       }
4615 #endif
4616       break;
4617 
4618     case '@':
4619       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4620 #ifdef USE_NAMED_GROUP
4621 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4622 	  PFETCH(c);
4623 	  if (c == '<' || c == '\'') {
4624 	    list_capture = 1;
4625 	    goto named_group2; /* (?@<name>...) */
4626 	  }
4627 	  PUNFETCH;
4628 	}
4629 #endif
4630 	*np = node_new_enclose_memory(env->option, 0);
4631 	CHECK_NULL_RETURN_MEMERR(*np);
4632 	num = scan_env_add_mem_entry(env);
4633 	if (num < 0) {
4634 	  onig_node_free(*np);
4635 	  return num;
4636 	}
4637 	else if (num >= (int )BIT_STATUS_BITS_NUM) {
4638 	  onig_node_free(*np);
4639 	  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4640 	}
4641 	NENCLOSE(*np)->regnum = num;
4642 	BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4643       }
4644       else {
4645 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4646       }
4647       break;
4648 
4649 #ifdef USE_POSIXLINE_OPTION
4650     case 'p':
4651 #endif
4652     case '-': case 'i': case 'm': case 's': case 'x':
4653       {
4654 	int neg = 0;
4655 
4656 	while (1) {
4657 	  switch (c) {
4658 	  case ':':
4659 	  case ')':
4660 	  break;
4661 
4662 	  case '-':  neg = 1; break;
4663 	  case 'x':  ONOFF(option, ONIG_OPTION_EXTEND,     neg); break;
4664 	  case 'i':  ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4665 	  case 's':
4666 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4667 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4668 	    }
4669 	    else
4670 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4671 	    break;
4672 
4673 	  case 'm':
4674 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4675 	      ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4676 	    }
4677 	    else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4678 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4679 	    }
4680 	    else
4681 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4682 	    break;
4683 #ifdef USE_POSIXLINE_OPTION
4684 	  case 'p':
4685 	    ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4686 	    break;
4687 #endif
4688 	  default:
4689 	    return ONIGERR_UNDEFINED_GROUP_OPTION;
4690 	  }
4691 
4692 	  if (c == ')') {
4693 	    *np = node_new_option(option);
4694 	    CHECK_NULL_RETURN_MEMERR(*np);
4695 	    *src = p;
4696 	    return 2; /* option only */
4697 	  }
4698 	  else if (c == ':') {
4699 	    OnigOptionType prev = env->option;
4700 
4701 	    env->option     = option;
4702 	    r = fetch_token(tok, &p, end, env);
4703 	    if (r < 0) return r;
4704 	    r = parse_subexp(&target, tok, term, &p, end, env);
4705 	    env->option = prev;
4706 	    if (r < 0) return r;
4707 	    *np = node_new_option(option);
4708 	    CHECK_NULL_RETURN_MEMERR(*np);
4709 	    NENCLOSE(*np)->target = target;
4710 	    *src = p;
4711 	    return 0;
4712 	  }
4713 
4714 	  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4715 	  PFETCH(c);
4716 	}
4717       }
4718       break;
4719 
4720     default:
4721       return ONIGERR_UNDEFINED_GROUP_OPTION;
4722     }
4723   }
4724   else {
4725     if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4726       goto group;
4727 
4728     *np = node_new_enclose_memory(env->option, 0);
4729     CHECK_NULL_RETURN_MEMERR(*np);
4730     num = scan_env_add_mem_entry(env);
4731     if (num < 0) return num;
4732     NENCLOSE(*np)->regnum = num;
4733   }
4734 
4735   CHECK_NULL_RETURN_MEMERR(*np);
4736   r = fetch_token(tok, &p, end, env);
4737   if (r < 0) return r;
4738   r = parse_subexp(&target, tok, term, &p, end, env);
4739   if (r < 0) return r;
4740 
4741   if (NTYPE(*np) == NT_ANCHOR)
4742     NANCHOR(*np)->target = target;
4743   else {
4744     NENCLOSE(*np)->target = target;
4745     if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4746       /* Don't move this to previous of parse_subexp() */
4747       r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4748       if (r != 0) return r;
4749     }
4750   }
4751 
4752   *src = p;
4753   return 0;
4754 }
4755 
4756 static const char* PopularQStr[] = {
4757   "?", "*", "+", "??", "*?", "+?"
4758 };
4759 
4760 static const char* ReduceQStr[] = {
4761   "", "", "*", "*?", "??", "+ and ??", "+? and ?"
4762 };
4763 
4764 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4765 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4766 {
4767   QtfrNode* qn;
4768 
4769   qn = NQTFR(qnode);
4770   if (qn->lower == 1 && qn->upper == 1) {
4771     return 1;
4772   }
4773 
4774   switch (NTYPE(target)) {
4775   case NT_STR:
4776     if (! group) {
4777       StrNode* sn = NSTR(target);
4778       if (str_node_can_be_split(sn, env->enc)) {
4779 	Node* n = str_node_split_last_char(sn, env->enc);
4780 	if (IS_NOT_NULL(n)) {
4781 	  qn->target = n;
4782 	  return 2;
4783 	}
4784       }
4785     }
4786     break;
4787 
4788   case NT_QTFR:
4789     { /* check redundant double repeat. */
4790       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4791       QtfrNode* qnt   = NQTFR(target);
4792       int nestq_num   = popular_quantifier_num(qn);
4793       int targetq_num = popular_quantifier_num(qnt);
4794       if (nestq_num < 0 || targetq_num < 0) {
4795         return ONIGERR_TYPE_BUG;
4796       }
4797 
4798 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4799       if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4800 	  IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4801         UChar buf[WARN_BUFSIZE];
4802 
4803         switch(ReduceTypeTable[targetq_num][nestq_num]) {
4804         case RQ_ASIS:
4805           break;
4806 
4807         case RQ_DEL:
4808           if (onig_verb_warn != onig_null_warn) {
4809             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4810                                  env->pattern, env->pattern_end,
4811                                  (UChar* )"redundant nested repeat operator");
4812             (*onig_verb_warn)((char* )buf);
4813           }
4814           goto warn_exit;
4815           break;
4816 
4817         default:
4818           if (onig_verb_warn != onig_null_warn) {
4819             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4820                                        env->pattern, env->pattern_end,
4821             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4822             PopularQStr[targetq_num], PopularQStr[nestq_num],
4823             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4824             (*onig_verb_warn)((char* )buf);
4825           }
4826           goto warn_exit;
4827           break;
4828         }
4829       }
4830 
4831     warn_exit:
4832 #endif
4833       if (targetq_num >= 0) {
4834 	if (nestq_num >= 0) {
4835 	  onig_reduce_nested_quantifier(qnode, target);
4836 	  goto q_exit;
4837 	}
4838 	else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4839 	  /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4840 	  if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4841 	    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4842 	  }
4843 	}
4844       }
4845     }
4846     break;
4847 
4848   default:
4849     break;
4850   }
4851 
4852   qn->target = target;
4853  q_exit:
4854   return 0;
4855 }
4856 
4857 
4858 #ifdef USE_SHARED_CCLASS_TABLE
4859 
4860 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS     8
4861 
4862 /* for ctype node hash table */
4863 
4864 typedef struct {
4865   OnigEncoding enc;
4866   int not;
4867   int type;
4868 } type_cclass_key;
4869 
type_cclass_cmp(type_cclass_key * x,type_cclass_key * y)4870 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
4871 {
4872   if (x->type != y->type) return 1;
4873   if (x->enc  != y->enc)  return 1;
4874   if (x->not  != y->not)  return 1;
4875   return 0;
4876 }
4877 
type_cclass_hash(type_cclass_key * key)4878 static int type_cclass_hash(type_cclass_key* key)
4879 {
4880   int i, val;
4881   UChar *p;
4882 
4883   val = 0;
4884 
4885   p = (UChar* )&(key->enc);
4886   for (i = 0; i < (int )sizeof(key->enc); i++) {
4887     val = val * 997 + (int )*p++;
4888   }
4889 
4890   p = (UChar* )(&key->type);
4891   for (i = 0; i < (int )sizeof(key->type); i++) {
4892     val = val * 997 + (int )*p++;
4893   }
4894 
4895   val += key->not;
4896   return val + (val >> 5);
4897 }
4898 
4899 static struct st_hash_type type_type_cclass_hash = {
4900     type_cclass_cmp,
4901     type_cclass_hash,
4902 };
4903 
4904 static st_table* OnigTypeCClassTable;
4905 
4906 
4907 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4908 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4909 {
4910   if (IS_NOT_NULL(node)) {
4911     CClassNode* cc = NCCLASS(node);
4912     if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4913     xfree(node);
4914   }
4915 
4916   if (IS_NOT_NULL(key)) xfree(key);
4917   return ST_DELETE;
4918 }
4919 
4920 extern int
onig_free_shared_cclass_table(void)4921 onig_free_shared_cclass_table(void)
4922 {
4923   if (IS_NOT_NULL(OnigTypeCClassTable)) {
4924     onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4925     onig_st_free_table(OnigTypeCClassTable);
4926     OnigTypeCClassTable = NULL;
4927   }
4928 
4929   return 0;
4930 }
4931 
4932 #endif /* USE_SHARED_CCLASS_TABLE */
4933 
4934 
4935 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4936 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4937 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4938 {
4939   BBuf *tbuf;
4940   int r;
4941 
4942   if (IS_NCCLASS_NOT(cc)) {
4943     bitset_invert(cc->bs);
4944 
4945     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4946       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4947       if (r != 0) return r;
4948 
4949       bbuf_free(cc->mbuf);
4950       cc->mbuf = tbuf;
4951     }
4952 
4953     NCCLASS_CLEAR_NOT(cc);
4954   }
4955 
4956   return 0;
4957 }
4958 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4959 
4960 typedef struct {
4961   ScanEnv*    env;
4962   CClassNode* cc;
4963   Node*       alt_root;
4964   Node**      ptail;
4965 } IApplyCaseFoldArg;
4966 
4967 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4968 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4969 		  int to_len, void* arg)
4970 {
4971   IApplyCaseFoldArg* iarg;
4972   ScanEnv* env;
4973   CClassNode* cc;
4974   BitSetRef bs;
4975 
4976   iarg = (IApplyCaseFoldArg* )arg;
4977   env = iarg->env;
4978   cc  = iarg->cc;
4979   bs = cc->bs;
4980 
4981   if (to_len == 1) {
4982     int is_in = onig_is_code_in_cc(env->enc, from, cc);
4983 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4984     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4985 	(is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
4986       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4987 	add_code_range(&(cc->mbuf), env, *to, *to);
4988       }
4989       else {
4990 	BITSET_SET_BIT(bs, *to);
4991       }
4992     }
4993 #else
4994     if (is_in != 0) {
4995       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4996 	if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4997 	add_code_range(&(cc->mbuf), env, *to, *to);
4998       }
4999       else {
5000 	if (IS_NCCLASS_NOT(cc)) {
5001 	  BITSET_CLEAR_BIT(bs, *to);
5002 	}
5003 	else
5004 	  BITSET_SET_BIT(bs, *to);
5005       }
5006     }
5007 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5008   }
5009   else {
5010     int r, i, len;
5011     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5012     Node *snode = NULL_NODE;
5013 
5014     if (onig_is_code_in_cc(env->enc, from, cc)
5015 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5016 	&& !IS_NCCLASS_NOT(cc)
5017 #endif
5018 	) {
5019       for (i = 0; i < to_len; i++) {
5020 	len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5021 	if (i == 0) {
5022 	  snode = onig_node_new_str(buf, buf + len);
5023 	  CHECK_NULL_RETURN_MEMERR(snode);
5024 
5025 	  /* char-class expanded multi-char only
5026 	     compare with string folded at match time. */
5027 	  NSTRING_SET_AMBIG(snode);
5028 	}
5029 	else {
5030 	  r = onig_node_str_cat(snode, buf, buf + len);
5031 	  if (r < 0) {
5032 	    onig_node_free(snode);
5033 	    return r;
5034 	  }
5035 	}
5036       }
5037 
5038       *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5039       CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5040       iarg->ptail = &(NCDR((*(iarg->ptail))));
5041     }
5042   }
5043 
5044   return 0;
5045 }
5046 
5047 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5048 parse_exp(Node** np, OnigToken* tok, int term,
5049 	  UChar** src, UChar* end, ScanEnv* env)
5050 {
5051   int r, len, group = 0;
5052   Node* qn;
5053   Node** targetp;
5054 
5055   *np = NULL;
5056   if (tok->type == (enum TokenSyms )term)
5057     goto end_of_token;
5058 
5059   switch (tok->type) {
5060   case TK_ALT:
5061   case TK_EOT:
5062   end_of_token:
5063   *np = node_new_empty();
5064   return tok->type;
5065   break;
5066 
5067   case TK_SUBEXP_OPEN:
5068     r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5069     if (r < 0) return r;
5070     if (r == 1) group = 1;
5071     else if (r == 2) { /* option only */
5072       Node* target;
5073       OnigOptionType prev = env->option;
5074 
5075       env->option = NENCLOSE(*np)->option;
5076       r = fetch_token(tok, src, end, env);
5077       if (r < 0) return r;
5078       r = parse_subexp(&target, tok, term, src, end, env);
5079       env->option = prev;
5080       if (r < 0) return r;
5081       NENCLOSE(*np)->target = target;
5082       return tok->type;
5083     }
5084     break;
5085 
5086   case TK_SUBEXP_CLOSE:
5087     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5088       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5089 
5090     if (tok->escaped) goto tk_raw_byte;
5091     else goto tk_byte;
5092     break;
5093 
5094   case TK_STRING:
5095   tk_byte:
5096     {
5097       *np = node_new_str(tok->backp, *src);
5098       CHECK_NULL_RETURN_MEMERR(*np);
5099 
5100       while (1) {
5101 	r = fetch_token(tok, src, end, env);
5102 	if (r < 0) return r;
5103 	if (r != TK_STRING) break;
5104 
5105 	r = onig_node_str_cat(*np, tok->backp, *src);
5106 	if (r < 0) return r;
5107       }
5108 
5109     string_end:
5110       targetp = np;
5111       goto repeat;
5112     }
5113     break;
5114 
5115   case TK_RAW_BYTE:
5116   tk_raw_byte:
5117     {
5118       *np = node_new_str_raw_char((UChar )tok->u.c);
5119       CHECK_NULL_RETURN_MEMERR(*np);
5120       len = 1;
5121       while (1) {
5122 	if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5123 	  if (len == enclen(env->enc, NSTR(*np)->s)) {
5124 	    r = fetch_token(tok, src, end, env);
5125 	    NSTRING_CLEAR_RAW(*np);
5126 	    goto string_end;
5127 	  }
5128 	}
5129 
5130 	r = fetch_token(tok, src, end, env);
5131 	if (r < 0) return r;
5132 	if (r != TK_RAW_BYTE) {
5133 	  /* Don't use this, it is wrong for little endian encodings. */
5134 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5135 	  int rem;
5136 	  if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5137 	    rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5138 	    (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5139 	    if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5140 	      NSTRING_CLEAR_RAW(*np);
5141 	      goto string_end;
5142 	    }
5143 	  }
5144 #endif
5145 	  return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5146 	}
5147 
5148 	r = node_str_cat_char(*np, (UChar )tok->u.c);
5149 	if (r < 0) return r;
5150 
5151 	len++;
5152       }
5153     }
5154     break;
5155 
5156   case TK_CODE_POINT:
5157     {
5158       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5159       int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5160       if (num < 0) return num;
5161 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5162       *np = node_new_str_raw(buf, buf + num);
5163 #else
5164       *np = node_new_str(buf, buf + num);
5165 #endif
5166       CHECK_NULL_RETURN_MEMERR(*np);
5167     }
5168     break;
5169 
5170   case TK_QUOTE_OPEN:
5171     {
5172       OnigCodePoint end_op[2];
5173       UChar *qstart, *qend, *nextp;
5174 
5175       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5176       end_op[1] = (OnigCodePoint )'E';
5177       qstart = *src;
5178       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5179       if (IS_NULL(qend)) {
5180 	nextp = qend = end;
5181       }
5182       *np = node_new_str(qstart, qend);
5183       CHECK_NULL_RETURN_MEMERR(*np);
5184       *src = nextp;
5185     }
5186     break;
5187 
5188   case TK_CHAR_TYPE:
5189     {
5190       switch (tok->u.prop.ctype) {
5191       case ONIGENC_CTYPE_WORD:
5192 	*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5193 	CHECK_NULL_RETURN_MEMERR(*np);
5194 	break;
5195 
5196       case ONIGENC_CTYPE_SPACE:
5197       case ONIGENC_CTYPE_DIGIT:
5198       case ONIGENC_CTYPE_XDIGIT:
5199 	{
5200 	  CClassNode* cc;
5201 
5202 #ifdef USE_SHARED_CCLASS_TABLE
5203           const OnigCodePoint *mbr;
5204 	  OnigCodePoint sb_out;
5205 
5206           r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5207 					   &sb_out, &mbr);
5208           if (r == 0 &&
5209               ONIGENC_CODE_RANGE_NUM(mbr)
5210               >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5211             type_cclass_key  key;
5212             type_cclass_key* new_key;
5213 
5214             key.enc  = env->enc;
5215             key.not  = tok->u.prop.not;
5216             key.type = tok->u.prop.ctype;
5217 
5218             THREAD_ATOMIC_START;
5219 
5220             if (IS_NULL(OnigTypeCClassTable)) {
5221               OnigTypeCClassTable
5222                 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5223               if (IS_NULL(OnigTypeCClassTable)) {
5224                 THREAD_ATOMIC_END;
5225                 return ONIGERR_MEMORY;
5226               }
5227             }
5228             else {
5229               if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )(UINTN)&key,
5230                                  (st_data_t* )np)) {
5231                 THREAD_ATOMIC_END;
5232                 break;
5233               }
5234             }
5235 
5236             *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5237 						     sb_out, mbr);
5238             if (IS_NULL(*np)) {
5239               THREAD_ATOMIC_END;
5240               return ONIGERR_MEMORY;
5241             }
5242 
5243             cc = NCCLASS(*np);
5244             NCCLASS_SET_SHARE(cc);
5245             new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5246             CHECK_NULL_RETURN_MEMERR(new_key);
5247 	    xmemcpy(new_key, &key, sizeof(type_cclass_key));
5248             onig_st_add_direct(OnigTypeCClassTable, (st_data_t )(UINTN)new_key,
5249                                (st_data_t )(UINTN)*np);
5250 
5251             THREAD_ATOMIC_END;
5252           }
5253           else {
5254 #endif
5255             *np = node_new_cclass();
5256             CHECK_NULL_RETURN_MEMERR(*np);
5257             cc = NCCLASS(*np);
5258             add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5259             if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5260 #ifdef USE_SHARED_CCLASS_TABLE
5261           }
5262 #endif
5263 	}
5264 	break;
5265 
5266       default:
5267 	return ONIGERR_PARSER_BUG;
5268 	break;
5269       }
5270     }
5271     break;
5272 
5273   case TK_CHAR_PROPERTY:
5274     r = parse_char_property(np, tok, src, end, env);
5275     if (r != 0) return r;
5276     break;
5277 
5278   case TK_CC_OPEN:
5279     {
5280       CClassNode* cc;
5281 
5282       r = parse_char_class(np, tok, src, end, env);
5283       if (r != 0) return r;
5284 
5285       cc = NCCLASS(*np);
5286       if (IS_IGNORECASE(env->option)) {
5287 	IApplyCaseFoldArg iarg;
5288 
5289 	iarg.env      = env;
5290 	iarg.cc       = cc;
5291 	iarg.alt_root = NULL_NODE;
5292 	iarg.ptail    = &(iarg.alt_root);
5293 
5294 	r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5295 					i_apply_case_fold, &iarg);
5296 	if (r != 0) {
5297 	  onig_node_free(iarg.alt_root);
5298 	  return r;
5299 	}
5300 	if (IS_NOT_NULL(iarg.alt_root)) {
5301           Node* work = onig_node_new_alt(*np, iarg.alt_root);
5302           if (IS_NULL(work)) {
5303             onig_node_free(iarg.alt_root);
5304             return ONIGERR_MEMORY;
5305           }
5306           *np = work;
5307 	}
5308       }
5309     }
5310     break;
5311 
5312   case TK_ANYCHAR:
5313     *np = node_new_anychar();
5314     CHECK_NULL_RETURN_MEMERR(*np);
5315     break;
5316 
5317   case TK_ANYCHAR_ANYTIME:
5318     *np = node_new_anychar();
5319     CHECK_NULL_RETURN_MEMERR(*np);
5320     qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5321     CHECK_NULL_RETURN_MEMERR(qn);
5322     NQTFR(qn)->target = *np;
5323     *np = qn;
5324     break;
5325 
5326   case TK_BACKREF:
5327     len = tok->u.backref.num;
5328     *np = node_new_backref(len,
5329 		   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5330 			   tok->u.backref.by_name,
5331 #ifdef USE_BACKREF_WITH_LEVEL
5332 			   tok->u.backref.exist_level,
5333 			   tok->u.backref.level,
5334 #endif
5335 			   env);
5336     CHECK_NULL_RETURN_MEMERR(*np);
5337     break;
5338 
5339 #ifdef USE_SUBEXP_CALL
5340   case TK_CALL:
5341     {
5342       int gnum = tok->u.call.gnum;
5343 
5344       if (gnum < 0) {
5345 	gnum = BACKREF_REL_TO_ABS(gnum, env);
5346 	if (gnum <= 0)
5347 	  return ONIGERR_INVALID_BACKREF;
5348       }
5349       *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5350       CHECK_NULL_RETURN_MEMERR(*np);
5351       env->num_call++;
5352     }
5353     break;
5354 #endif
5355 
5356   case TK_ANCHOR:
5357     *np = onig_node_new_anchor(tok->u.anchor);
5358     CHECK_NULL_RETURN_MEMERR(*np);
5359     break;
5360 
5361   case TK_OP_REPEAT:
5362   case TK_INTERVAL:
5363     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5364       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5365 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5366       else
5367 	*np = node_new_empty();
5368   CHECK_NULL_RETURN_MEMERR(*np);
5369     }
5370     else {
5371       goto tk_byte;
5372     }
5373     break;
5374 
5375   default:
5376     return ONIGERR_PARSER_BUG;
5377     break;
5378   }
5379 
5380   {
5381     targetp = np;
5382 
5383   re_entry:
5384     r = fetch_token(tok, src, end, env);
5385     if (r < 0) return r;
5386 
5387   repeat:
5388     if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5389       if (is_invalid_quantifier_target(*targetp))
5390 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5391 
5392       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5393 			       (r == TK_INTERVAL ? 1 : 0));
5394       CHECK_NULL_RETURN_MEMERR(qn);
5395       NQTFR(qn)->greedy = tok->u.repeat.greedy;
5396       r = set_quantifier(qn, *targetp, group, env);
5397       if (r < 0) {
5398 	onig_node_free(qn);
5399 	return r;
5400       }
5401 
5402       if (tok->u.repeat.possessive != 0) {
5403 	Node* en;
5404 	en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5405 	if (IS_NULL(en)) {
5406 	  onig_node_free(qn);
5407 	  return ONIGERR_MEMORY;
5408 	}
5409 	NENCLOSE(en)->target = qn;
5410 	qn = en;
5411       }
5412 
5413       if (r == 0) {
5414 	*targetp = qn;
5415       }
5416       else if (r == 1) {
5417 	onig_node_free(qn);
5418       }
5419       else if (r == 2) { /* split case: /abc+/ */
5420 	Node *tmp;
5421 
5422 	*targetp = node_new_list(*targetp, NULL);
5423 	if (IS_NULL(*targetp)) {
5424 	  onig_node_free(qn);
5425 	  return ONIGERR_MEMORY;
5426 	}
5427 	tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5428 	if (IS_NULL(tmp)) {
5429 	  onig_node_free(qn);
5430 	  return ONIGERR_MEMORY;
5431 	}
5432 	targetp = &(NCAR(tmp));
5433       }
5434       goto re_entry;
5435     }
5436   }
5437 
5438   return r;
5439 }
5440 
5441 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5442 parse_branch(Node** top, OnigToken* tok, int term,
5443 	     UChar** src, UChar* end, ScanEnv* env)
5444 {
5445   int r;
5446   Node *node, **headp;
5447 
5448   *top = NULL;
5449   r = parse_exp(&node, tok, term, src, end, env);
5450   if (r < 0) return r;
5451 
5452   if (r == TK_EOT || r == term || r == TK_ALT) {
5453     *top = node;
5454   }
5455   else {
5456     *top  = node_new_list(node, NULL);
5457     CHECK_NULL_RETURN_MEMERR(*top);
5458     headp = &(NCDR(*top));
5459     while (r != TK_EOT && r != term && r != TK_ALT) {
5460       r = parse_exp(&node, tok, term, src, end, env);
5461       CHECK_NULL_RETURN_MEMERR(node);
5462       if (r < 0) return r;
5463 
5464       if (NTYPE(node) == NT_LIST) {
5465 	*headp = node;
5466 	while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5467 	headp = &(NCDR(node));
5468       }
5469       else {
5470 	*headp = node_new_list(node, NULL);
5471 	headp = &(NCDR(*headp));
5472       }
5473     }
5474   }
5475 
5476   return r;
5477 }
5478 
5479 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5480 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5481 parse_subexp(Node** top, OnigToken* tok, int term,
5482 	     UChar** src, UChar* end, ScanEnv* env)
5483 {
5484   int r;
5485   Node *node, **headp;
5486 
5487   *top = NULL;
5488   r = parse_branch(&node, tok, term, src, end, env);
5489   if (r < 0) {
5490     onig_node_free(node);
5491     return r;
5492   }
5493 
5494   if (r == term) {
5495     *top = node;
5496   }
5497   else if (r == TK_ALT) {
5498     *top  = onig_node_new_alt(node, NULL);
5499     CHECK_NULL_RETURN_MEMERR(*top);
5500     headp = &(NCDR(*top));
5501     while (r == TK_ALT) {
5502       r = fetch_token(tok, src, end, env);
5503       if (r < 0) return r;
5504       r = parse_branch(&node, tok, term, src, end, env);
5505       if (r < 0) return r;
5506 
5507       *headp = onig_node_new_alt(node, NULL);
5508       headp = &(NCDR(*headp));
5509     }
5510 
5511     if (tok->type != (enum TokenSyms )term)
5512       goto err;
5513   }
5514   else {
5515   err:
5516     if (term == TK_SUBEXP_CLOSE)
5517       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5518     else
5519       return ONIGERR_PARSER_BUG;
5520   }
5521 
5522   return r;
5523 }
5524 
5525 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5526 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5527 {
5528   int r;
5529   OnigToken tok;
5530 
5531   r = fetch_token(&tok, src, end, env);
5532   if (r < 0) return r;
5533   r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5534   if (r < 0) return r;
5535   return 0;
5536 }
5537 
5538 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5539 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5540 		     regex_t* reg, ScanEnv* env)
5541 {
5542   int r;
5543   UChar* p;
5544 
5545 #ifdef USE_NAMED_GROUP
5546   names_clear(reg);
5547 #endif
5548 
5549   scan_env_clear(env);
5550   env->option         = reg->options;
5551   env->case_fold_flag = reg->case_fold_flag;
5552   env->enc            = reg->enc;
5553   env->syntax         = reg->syntax;
5554   env->pattern        = (UChar* )pattern;
5555   env->pattern_end    = (UChar* )end;
5556   env->reg            = reg;
5557 
5558   *root = NULL;
5559   p = (UChar* )pattern;
5560   r = parse_regexp(root, &p, (UChar* )end, env);
5561   reg->num_mem = env->num_mem;
5562   return r;
5563 }
5564 
5565 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5566 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5567 				UChar* arg, UChar* arg_end)
5568 {
5569   env->error     = arg;
5570   env->error_end = arg_end;
5571 }
5572