*** Lib/regex_syntax.py	Tue Dec 31 01:02:51 1991
--- Lib/regex_syntax.py	Sun Sep 15 08:36:41 1996
***************
*** 32,39 ****
--- 32,50 ----
  #	*, +, ? - only special when not after the beginning, (, or |
  RE_CONTEXT_INDEP_OPS = 32
  
+ # Treat \w as [A-Za-z0-9_] instead of just [A-Za-z0-9], \W as [^A-Za-z0-9_],
+ # \d as [0-9], \D as [^0-9], \s as [ \t\r\n\f], \S as [^ \t\r\n\f], like Perl.
+ # Additionally, treat \h as a hex digit: [0-9a-fA-F] and \H as [^0-9a-fA-F],
+ # and \l as a letter of the alphabet: [A-Za-z] and \L as [^A-Za-z].
+ RE_EXTRA_CLASSES = 256
+ 
+ # Allow the minimal quantifying operators ??, +?, and *? which will match
+ # the shortest possible piece of the search string instead of the longest.
+ RE_MINIMAL_OPS = 512
+ 
  # Now define combinations of bits for the standard possibilities.
  RE_SYNTAX_AWK = (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS)
+ RE_SYNTAX_PERLISH = (RE_SYNTAX_AWK | RE_EXTRA_CLASSES | RE_MINIMAL_OPS)
  RE_SYNTAX_EGREP = (RE_SYNTAX_AWK | RE_NEWLINE_OR)
  RE_SYNTAX_GREP = (RE_BK_PLUS_QM | RE_NEWLINE_OR)
  RE_SYNTAX_EMACS = 0



*** Modules/regexpr.c	Fri Sep 13 02:57:39 1996
--- Modules/regexpr.c	Sun Sep 15 08:31:49 1996
***************
*** 106,111 ****
--- 106,119 ----
    Rwordend,		/* end of word */
    Rwordbound,		/* word bound */
    Rnotwordbound,	/* not word bound */
+   Rdigitchar,		/* P - digit character */
+   Rnotdigitchar,	/* P - not digit character */
+   Rhexdigitchar,	/* P - hexadecimal digit character */
+   Rnothexdigitchar,	/* P - not hexadecimal digit character */
+   Rwhitespacechar,	/* P - whitespace character */
+   Rnotwhitespacechar,	/* P - not whitespace character */
+   Rletterchar,		/* P - letter of alphabet */
+   Rnotletterchar,	/* P - not letter of alphabet */
  #ifdef emacs
    Remacs_at_dot,	/* emacs: at dot */
    Remacs_syntaxspec,	/* syntaxspec */
***************
*** 122,127 ****
--- 130,136 ----
  static unsigned char regexp_precedences[Rnum_ops];
  static int regexp_context_indep_ops;
  static int regexp_ansi_sequences;
+ static int regexp_minimal_ops;		/* P - added this flag */
  
  #define NUM_LEVELS  5    /* number of precedence levels in use */
  #define MAX_NESTING 100  /* max nesting level of operators */
***************
*** 143,148 ****
--- 152,161 ----
  
  #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
  #define Sword 1
+ #define Sdigit 2
+ #define Shexdigit 4
+ #define Swhitespace 8
+ #define Sletter 16
  
  #ifdef SYNTAX_TABLE
  char *re_syntax_table;
***************
*** 164,177 ****
      {
        syntax_table_inited = 1;
        memset(re_syntax_table, 0, 256);
!       for (a = 'a'; a <= 'z'; a++)
! 	re_syntax_table[a] = Sword;
        for (a = 'A'; a <= 'Z'; a++)
! 	re_syntax_table[a] = Sword;
!       for (a = '0'; a <= '9'; a++)
! 	re_syntax_table[a] = Sword;
      }
  #endif /* !emacs && !SYNTAX_TABLE */
    re_compile_initialized = 1;
    for (a = 0; a < 256; a++)
      {
--- 177,210 ----
      {
        syntax_table_inited = 1;
        memset(re_syntax_table, 0, 256);
!       for (a = 'a'; a <= 'z'; a++)	/* P - added Sletter */
! 	re_syntax_table[a] = Sword | Sletter;
        for (a = 'A'; a <= 'Z'; a++)
! 	re_syntax_table[a] = Sword | Sletter;
! 
!       for (a = '0'; a <= '9'; a++)	/* P - added Sdigit and Shexdigit */
! 	re_syntax_table[a] = Sword | Sdigit | Shexdigit;
!       for (a = 'a'; a <= 'f'; a++)
!         re_syntax_table[a] = Sword | Sletter | Shexdigit;
!       for (a = 'A'; a <= 'F'; a++)
!         re_syntax_table[a] = Sword | Sletter | Shexdigit;
! 
!       re_syntax_table[' '] = Swhitespace;	/* P - added these entries */
!       re_syntax_table['\t'] = Swhitespace;
!       re_syntax_table['\r'] = Swhitespace;
!       re_syntax_table['\n'] = Swhitespace;
!       re_syntax_table['\f'] = Swhitespace;
      }
+ 
+ /* P - The following entry is outside syntax_table_inited because we might */
+ /*     have to update it whenever the regexp_syntax changes.	   */
+ 
+   if (regexp_syntax & RE_EXTRA_CLASSES)	/* P - added this clause */
+       re_syntax_table['_'] = Sword;
+   else
+       re_syntax_table['_'] = 0;
  #endif /* !emacs && !SYNTAX_TABLE */
+ 
    re_compile_initialized = 1;
    for (a = 0; a < 256; a++)
      {
***************
*** 228,233 ****
--- 261,279 ----
        regexp_quoted_ops['`'] = Rbegbuf;
        regexp_quoted_ops['\''] = Rendbuf;
      }
+   if (regexp_syntax & RE_EXTRA_CLASSES)		/* P - added this clause */
+     {
+       regexp_quoted_ops['s'] = Rwhitespacechar;
+       regexp_quoted_ops['S'] = Rnotwhitespacechar;
+       regexp_quoted_ops['w'] = Rwordchar;
+       regexp_quoted_ops['W'] = Rnotwordchar;
+       regexp_quoted_ops['d'] = Rdigitchar;
+       regexp_quoted_ops['D'] = Rnotdigitchar;
+       regexp_quoted_ops['h'] = Rhexdigitchar;		/* not part of Perl, */
+       regexp_quoted_ops['H'] = Rnothexdigitchar;	/* but convenient */
+       regexp_quoted_ops['l'] = Rletterchar;		/* not part of Perl, */
+       regexp_quoted_ops['L'] = Rnotletterchar;		/* but convenient */
+     }
    if (regexp_syntax & RE_ANSI_HEX)
      regexp_quoted_ops['v'] = Rextended_memory;
    for (a = 0; a < Rnum_ops; a++)
***************
*** 248,253 ****
--- 294,302 ----
    regexp_precedences[Rend] = 0;
    regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
    regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
+ 
+ 						/* P - added this flag */
+   regexp_minimal_ops = (regexp_syntax & RE_MINIMAL_OPS) != 0;
  }
  
  int re_set_syntax(syntax)
***************
*** 524,529 ****
--- 573,593 ----
  	      goto op_error;
  	    else
  	      goto normal_char;
+ 	  
+ 	  if (regexp_minimal_ops && pos < size && regex[pos] == '?')
+ 	    {			/* P - added this clause for minimal ? */
+ 	      pos++;
+ 	      if (CURRENT_LEVEL_START == pattern_offset)
+ 	        break; /* P - ignore empty patterns for ?? */
+ 
+ 	      ALLOC(6);
+ 	      INSERT_JUMP(CURRENT_LEVEL_START, Cjump,
+ 	        pattern_offset + 3);
+ 	      INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
+ 	        CURRENT_LEVEL_START + 6);
+ 	      break;
+ 	    }
+ 
  	  if (CURRENT_LEVEL_START == pattern_offset)
  	    break; /* ignore empty patterns for ? */
  	  ALLOC(3);
***************
*** 537,542 ****
--- 601,631 ----
  	      goto op_error;
  	    else
  	      goto normal_char;
+ 
+ 	  if (regexp_minimal_ops && pos < size && regex[pos] == '?')
+ 	    {			/* P - added this clause for minimal * and + */
+ 	      pos++;
+ 	      if (CURRENT_LEVEL_START == pattern_offset)
+ 	        break; /* P - ignore empty patterns for *? and +? */
+ 
+ 	      if (op == Rstar)
+ 	        {
+ 	        ALLOC(6);
+ 	        INSERT_JUMP(CURRENT_LEVEL_START, Cjump,
+ 	          pattern_offset + 3);
+ 	        INSERT_JUMP(pattern_offset, Cfailure_jump,
+ 	          CURRENT_LEVEL_START + 3);
+ 		    }
+ 		  else /* op == Rplus */
+ 		    {
+ 		    ALLOC(3);
+ 		    INSERT_JUMP(pattern_offset, Cfailure_jump,
+ 		      CURRENT_LEVEL_START);
+ 		    }
+ 
+ 	      break;
+ 	    }
+ 
  	  if (CURRENT_LEVEL_START == pattern_offset)
  	    break; /* ignore empty patterns for + and * */
  	  ALLOC(9);
***************
*** 686,691 ****
--- 775,812 ----
  	  opcode = Cnotsyntaxspec;
  	  ch = Sword;
  	  goto store_opcode_and_arg;
+ 	case Rdigitchar:		/* P - added this case */
+ 	  opcode = Csyntaxspec;
+ 	  ch = Sdigit;
+ 	  goto store_opcode_and_arg;
+ 	case Rnotdigitchar:		/* P - added this case */
+ 	  opcode = Cnotsyntaxspec;
+ 	  ch = Sdigit;
+ 	  goto store_opcode_and_arg;
+ 	case Rhexdigitchar:		/* P - added this case */
+ 	  opcode = Csyntaxspec;
+ 	  ch = Shexdigit;
+ 	  goto store_opcode_and_arg;
+ 	case Rnothexdigitchar:		/* P - added this case */
+ 	  opcode = Cnotsyntaxspec;
+ 	  ch = Shexdigit;
+ 	  goto store_opcode_and_arg;
+ 	case Rwhitespacechar:		/* P - added this case */
+ 	  opcode = Csyntaxspec;
+ 	  ch = Swhitespace;
+ 	  goto store_opcode_and_arg;
+ 	case Rnotwhitespacechar:	/* P - added this case */
+ 	  opcode = Cnotsyntaxspec;
+ 	  ch = Swhitespace;
+ 	  goto store_opcode_and_arg;
+ 	case Rletterchar:		/* P - added this case */
+ 	  opcode = Csyntaxspec;
+ 	  ch = Sletter;
+ 	  goto store_opcode_and_arg;
+ 	case Rnotletterchar:		/* P - added this case */
+ 	  opcode = Cnotsyntaxspec;
+ 	  ch = Sletter;
+ 	  goto store_opcode_and_arg;
  	case Rwordbeg:
  	  opcode = Cwordbeg;
  	  goto store_opcode;
***************
*** 803,815 ****
        case Csyntaxspec:
  	syntaxcode = code[pos++];
  	for (a = 0; a < 256; a++)
! 	  if (SYNTAX(a) == syntaxcode)
  	    fastmap[a] = 1;
  	return;
        case Cnotsyntaxspec:
  	syntaxcode = code[pos++];
  	for (a = 0; a < 256; a++)
! 	  if (SYNTAX(a) != syntaxcode)
  	    fastmap[a] = 1;
  	return;
        case Ceol:
--- 924,936 ----
        case Csyntaxspec:
  	syntaxcode = code[pos++];
  	for (a = 0; a < 256; a++)
! 	  if (SYNTAX(a) & syntaxcode)		/* P - changed == to & */
  	    fastmap[a] = 1;
  	return;
        case Cnotsyntaxspec:
  	syntaxcode = code[pos++];
  	for (a = 0; a < 256; a++)
! 	  if (!(SYNTAX(a) & syntaxcode))	/* P - changed != to !(... & ...) */
  	    fastmap[a] = 1;
  	return;
        case Ceol:
***************
*** 1337,1350 ****
  	  break;
  	case Csyntaxspec:
  	  NEXTCHAR(ch);
! 	  if (SYNTAX(ch) != (unsigned char)*code++)
! 	    goto fail;
! 	  break;
! 	case Cnotsyntaxspec:
! 	  NEXTCHAR(ch);
! 	  if (SYNTAX(ch) != (unsigned char)*code++)
  	    break;
  	  goto fail;
  #ifdef emacs
  	case Cemacs_at_dot:
  	  if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
--- 1458,1471 ----
  	  break;
  	case Csyntaxspec:
  	  NEXTCHAR(ch);
! 	  if (SYNTAX(ch) & ((unsigned char)*code++)) /* P - bit test replaces !=; branches swapped */
  	    break;
  	  goto fail;
+ 	case Cnotsyntaxspec:
+ 	  NEXTCHAR(ch);
+ 	  if (SYNTAX(ch) & ((unsigned char)*code++)) /* P - bit test replaces !=; branches swapped */
+ 	    goto fail;
+ 	  break;
  #ifdef emacs
  	case Cemacs_at_dot:
  	  if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)



*** Modules/regexpr.h	Sat May 25 05:51:22 1996
--- Modules/regexpr.h	Sun Sep 15 08:34:04 1996
***************
*** 55,63 ****
--- 55,66 ----
  #define RE_CONTEXT_INDEP_OPS	32   /* ^$?*+ are special in all contexts */
  #define RE_ANSI_HEX		64   /* ansi sequences (\n etc) and \xhh */
  #define RE_NO_GNU_EXTENSIONS   128   /* no gnu extensions */
+ #define RE_EXTRA_CLASSES       256   /* \w\W\d\D\s\S\h\H\l\L char classes */
+ #define RE_MINIMAL_OPS	       512   /* allow minimal operators ??, +?, *? */
  
  /* definitions for some common regexp styles */
  #define RE_SYNTAX_AWK	(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_CONTEXT_INDEP_OPS)
+ #define RE_SYNTAX_PERLISH (RE_SYNTAX_AWK|RE_EXTRA_CLASSES|RE_MINIMAL_OPS)
  #define RE_SYNTAX_EGREP	(RE_SYNTAX_AWK|RE_NEWLINE_OR)
  #define RE_SYNTAX_GREP	(RE_BK_PLUS_QM|RE_NEWLINE_OR)
  #define RE_SYNTAX_EMACS	0
