LeetCode #10 Regular Expression Matching

'.' Matches any single character.
'*' Matches zero or more of the preceding element.

The matching should cover the entire input string (not partial).
The function prototype should be:
bool isMatch(const char *s, const char *p)

Some examples:
isMatch(“aa”,“a”) → false
isMatch(“aa”,“aa”) → true
isMatch(“aaa”,“aa”) → false
isMatch(“aa”, “a*”) → true
isMatch(“aa”, “.") → true
isMatch(“ab”, ".”) → true
isMatch(“aab”, “cab”) → true

分析

看到題目很直覺想到 Finite State Machine，可以使用狀態機的思維來解決，Pattern 每一個字元作為一個狀態 (包含 ‘.’ 任何字元)，串成單一路線的狀態串連，最後則是結束狀態。

對於 pattern 'ab.d' 所建立的狀態機

輸入字串從起始狀態開始判斷，沿著路線走到結束狀態，每一個狀態從輸入字串取一個字元跟該狀態的字元比較，若不符合則 isMatch 直接判斷失敗，若符合則消耗該字元，把剩下的輸入字串丟到下一個狀態繼續做比較。若成功走到結束狀態，且輸入字串已全部被消耗完，則 isMatch 判斷成功。

狀態機消耗輸入字串 s 的示意圖，只有在最上面的 case 下，消耗所有的輸入字串字元，才算是 isMatch = true

考慮到 ‘*’ 符號，包含零個或是多個字元，在這個狀態中不一定只消耗一個字元，因此需要特別處理，根據可輸入字串的資料，消耗一個到多個該狀態處理的字元，剩下的輸入字串丟到下一個狀態去比較。

考慮 '' 符號的特別處理，進入到 'c' 狀態的輸入字串為 'ccc'，考慮消耗各種可能，將 'ccc', 'cc', 'c', 以及 '∅' 丟入到下個狀態進行判斷

解法

使用 Go：

這邊要注意的是 Go 對於字串的處理，字串是採用 utf-8 編碼，雖然題目輸入都會是 ASCII 字元，但保險起見還是使用 rune 來取得每一個字元來判斷。

// import ("unicode/utf8")
var s string
r, width := utf8.DecodeRuneInString(s)
// TODO: check rune 'r'
rest_s := s[width:]

之後建立 interface 來封裝函數，並定義解析 pattern 的方法：

type Matcher interface {
   IsMatch(s string) bool
}

func ParsePattern(p string) Matcher {
   // TODO:
}

因此主流程函數寫成這樣，不用管實作只需要處理邏輯流程：

func isMatch(s string, p string) bool {
   m := ParsePattern(p)
   return m.IsMatch(s)
}

實作 ParsePattern 即可，以下是完整程式碼，雖然有點浪費記憶體：

import (
    "unicode/utf8"
)

type Matcher interface {
    IsMatch(s string) bool
}

func ParsePattern(p string) Matcher {
    curtRune, width := utf8.DecodeRuneInString(p)
    if width <= 0 {
        return &state{}
    }
    nextRune, width2 := utf8.DecodeRuneInString(p[width:])
    s := &state{
        Char:       curtRune,
        AnyChar:    curtRune == '.',
        ZeroOrMore: nextRune == '*',
    }
    if s.ZeroOrMore {
        width += width2
    }
    s.Next = ParsePattern(p[width:])
    return s
}

type state struct {
    Char       rune
    AnyChar    bool
    ZeroOrMore bool
    Next       Matcher
}

func (s state) IsMatchChar(r rune) bool {
    return s.AnyChar || r == s.Char
}

func (s state) IsMatch(str string) bool {
    if s.Next == nil {
        return len(str) <= 0
    } else if len(str) <= 0 {
        if s.ZeroOrMore {
            return s.Next.IsMatch(str)
        }
        return false
    }
    if s.ZeroOrMore {
        // Consider Zero (No char consume)
        if ok := s.Next.IsMatch(str); ok {
            return true
        }
        for len(str) > 0 {
            curtRune, width := utf8.DecodeRuneInString(str)
            if s.IsMatchChar(curtRune) {
                str = str[width:]
                if ok := s.Next.IsMatch(str); ok {
                    return true
                }
            } else {
                return s.Next.IsMatch(str)
            }
        }
    } else {
        curtRune, width := utf8.DecodeRuneInString(str)
        if s.IsMatchChar(curtRune) {
            return s.Next.IsMatch(str[width:])
        }
    }
    return false
}

func isMatch(s string, p string) bool {
    m := ParsePattern(p)
    return m.IsMatch(s)
}

以上程式碼還可以更好更快，不需要額外建立 state 結構省記憶體空間，使用 for loop 不使用 recursive call，避免過長的 pattern 可能導致的 stack overflow 問題等等。


import (
   "unicode/utf8"
)

func isMatch(s string, p string) bool {
   for true {
      r1, w1 := utf8.DecodeRuneInString(p)
      if w1 <= 0 {
         break
      }
      p = p[w1:]

      r2, w2 := utf8.DecodeRuneInString(p)
      if r2 == '*' {
         // Zero or more
         p = p[w2:]
         for true {
            if ok := isMatch(s, p); ok {
               return true
            }
            r3, w3 := utf8.DecodeRuneInString(s)
            if (r1 == '.' || r1 == r3) && w3 > 0 {
               s = s[w3:]
            } else {
               break
            }
         }
      } else {
         // One
         r3, w3 := utf8.DecodeRuneInString(s)
         if w3 <= 0 || !(r1 == '.' || r1 == r3) {

            return false
         }
         s = s[w3:]
      }
   }

   return len(s) <= 0
}