[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index][Thread Index][Top&Search][Original]

PATCH [perl #59328] In regexes, \N{U+...} doesn't match for code points > 255



The problem is that, when \N{U+...} appears outside a character class, the
space allocated to hold its code point was a single char, which of course
only works for ordinals up through 255.

The patch is attached.
--- regcomp.c.orig	2008-11-03 06:54:29.000000000 -0700
+++ regcomp.c	2008-11-05 17:54:08.000000000 -0700
@@ -6617,20 +6617,30 @@
             | PERL_SCAN_DISALLOW_PREFIX
             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
         UV cp;
-	char string;
         len = (STRLEN)(endbrace - name - 2);
         cp = grok_hex(name + 2, &len, &fl, NULL);
         if ( len != (STRLEN)(endbrace - name - 2) ) {
             cp = 0xFFFD;
         }    
-        if (cp > 0xff)
-            RExC_utf8 = 1;
         if ( valuep ) {
+	    if (cp > 0xff) RExC_utf8 = 1;
             *valuep = cp;
             return NULL;
         }
-	string = (char)cp;
-        sv_str= newSVpvn(&string, 1);
+
+	/* Need to convert to utf8 if either: won't fit into a byte, or the re
+	 * is going to be in utf8 and the representation changes under utf8. */
+	if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
+	    U8 string[UTF8_MAXBYTES+1];
+	    U8 *tmps;
+	    RExC_utf8 = 1;
+	    tmps = uvuni_to_utf8(string, cp);
+	    sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
+	} else {    /* Otherwise, no need for utf8, can skip that step */
+	    char string;
+	    string = (char)cp;
+	    sv_str= newSVpvn(&string, 1);
+	}
     } else {
         /* fetch the charnames handler for this scope */
         HV * const table = GvHV(PL_hintgv);
@@ -6809,7 +6819,7 @@
         Set_Node_Cur_Length(ret); /* MJD */
         RExC_parse--; 
         nextchar(pRExC_state);
-    } else {
+    } else {	/* zero length */
         ret = reg_node(pRExC_state,NOTHING);
     }
     if (!cached) {
--- t/op/re_tests.orig	2008-11-03 08:24:51.000000000 -0700
+++ t/op/re_tests	2008-11-05 18:33:53.000000000 -0700
@@ -1358,3 +1358,4 @@
 /^\s*i.*?o\s*$/s	io\n io	y	-	-
 # As reported in #59168 by Father Chrysostomos:
 /(.*?)a(?!(a+)b\2c)/	baaabaac	y	$&-$1	baa-ba
+/\N{U+0100}/	\x{100}	y	$&	\x{100}	# Bug #59328

Follow-Ups from:
demerphq <demerphq@gmail.com>

[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index][Thread Index][Top&Search][Original]