字串核對


 說明

今日的一些高階程式語言對於字串的處理支援越來越強大(例如Java、Perl等),不過字串搜尋本身仍是個值得 探討的課題,在這邊以Boyer- Moore法來說明如何進行字串說明,這個方法快且原理簡潔易懂。

解法

字串搜尋本身不難,使用暴力法也可以求解,但如何快速搜尋字串就不簡單了,傳統的字串搜尋是從關鍵字與字串的開頭 開始比對,例如 Knuth-Morris-Pratt 演算法 字串搜尋,這個方法也不錯,不過要花時間在公式計算上;Boyer-Moore字串核對改由關鍵字的後面開始核對字串,並製作前進表,如果比對不符合則依 前進表中的值前進至下一個核對處,假設是p好了,然後比對字串中p-n+1至p的值是否與關鍵字相同。
字串核對


那麼前進表該如何前進,舉個實際的例子,如果要在字串中搜尋JUST這個字串,則可能遇到的幾個情況如下所示:
字串核對



依照這個例子,可以決定出我們的前進值表如下:
其它 J U S T
4 3 2 1 4(match?)

如果關鍵字中有重複出現的字元,則前進值就會有兩個以上的值,此時則取前進值較小的值,如此就不會跳過可能的位置,例如texture這個關鍵字,t的前 進值應該取後面的3而不是取前面的7。

實作:Toy    C    Java    Python    Scala    Ruby    JavaScript    Haskell

SKIP_TABLE_SIZE = 256

def table(skip, key) { 
    n = key.length()
    iterate(0, SKIP_TABLE_SIZE).forEach(k -> skip.set(k, n))
    iterate(0, n - 1).forEach(k -> skip.set(key.charCodeAt(k), n - k - 1))
} 

def indexOf(skip, from, text, key) { 
    index = from
    while index < text.length() {
        if text.substring(index - key.length() + 1, index + 1) == key {
            return index - key.length() + 1
        }
        index += skip.get(text.charCodeAt(index))
    }
    return -1 
} 

text = input('字串:')
key = input('關鍵字:')

skip = range(0, SKIP_TABLE_SIZE)
table(skip, key)

p = indexOf(skip, key.length() - 1, text, key)
while p != -1 {
    println(text.substring(p, text.length()))
    p = indexOf(skip, p + key.length() + 1, text, key)
}

#include <stdio.h> 
#include <stdlib.h>
#include <string.h>
#define SKIP_TABLE_SIZE 256
#define STRING_LENGTH 80

void table(int*, char*); // 建立前進表
int indexOf(int*, int, char*, char*); // 搜尋關鍵字
void subString(char*, char*, int, int); // 取出子字串

int main(void) {
int skip[SKIP_TABLE_SIZE];

char input[STRING_LENGTH];
char key[STRING_LENGTH];
char sub[STRING_LENGTH] = {'\0'};

printf("字串:");
gets(input);
printf("關鍵字:");
gets(key);

table(skip, key);

int m = strlen(input); // 計算字串長度
int n = strlen(key);
int p = indexOf(skip, n - 1, input, key);

while(p != -1) {
subString(input, sub, p, m);
printf("%s\n", sub);
p = indexOf(skip, p + n + 1, input, key);
}

return 0;
}

void table(int* skip, char *key) {
int n = strlen(key);
int k;
for(k = 0; k < SKIP_TABLE_SIZE; k++) {
skip[k] = n;
}
for(k = 0; k < n - 1; k++) {
skip[key[k]] = n - k - 1;
}
}

int indexOf(int* skip, int from, char* str, char* key) {
char sub[STRING_LENGTH] = {'\0'};

int strLen = strlen(str);
int keyLen = strlen(key);
int index = from;

while(index < strLen) {
subString(str, sub, index - keyLen + 1, index);
if(!strcmp(sub, key)) { // 比較兩字串是否相同
return index - keyLen + 1;
}
index += skip[str[index]];
}

return -1;
}

void subString(char *text, char* sub, int s, int e) {
int i, j;
for(i = s, j = 0; i <= e; i++, j++) {
sub[j] = text[i];
}
sub[j] = '\0';
}

import java.util.*;
import java.io.*;
import static java.lang.System.*;

public class StringMatcher implements Iterable<String> {
private String str;
private String key;
private int[] skip = new int[256];

public StringMatcher(String str, String key) {
this.str = str;
this.key = key;
Arrays.fill(skip, key.length());
for(int k = 0; k < key.length() - 1; k++) {
skip[key.charAt(k)] = key.length() - k - 1;
}
}

public Iterator<String> iterator() { return new Itr(); }

private class Itr implements Iterator<String> {
private int index;

{ index = indexOf(key.length() - 1, str, key); }

private int indexOf(int from, String str, String key) {
int tp = from;
while(tp < str.length() &&
! str.substring(tp - key.length() + 1, tp + 1)
.equals(key)) {
tp += skip[str.charAt(tp)];
}
return tp - key.length() + 1;
}

public boolean hasNext() {
return index < str.length() - 1;
}

public String next() {
String sub = str.substring(index);
index = indexOf(index + key.length() + 1, str, key);
return sub;
}

public void remove() { throw new RuntimeException("Not supported"); }
}

public static void main(String[] args) {
Scanner scanner = new Scanner(in);

out.print("請輸入字串:");
String str = scanner.nextLine();
out.print("請輸入搜尋關鍵字:");
String key = scanner.nextLine();

for(String s : new StringMatcher(str, key)) {
out.println(s);
}
}
}

def matcher(str, key):
strLen = len(str)
keyLen = len(key)

skip = [keyLen - key.rindex(chr(k)) - 1
if(chr(k) in key[:-1]) else keyLen for k in range(256)]

def next(i):
return (next(i + skip[ord(str[i])])
if(i < strLen and str[i - keyLen + 1: i + 1] != key)
else i - keyLen + 1)

def match(i):
nextI = next(i + keyLen + 1)
return [str[i:]] + (match(nextI) if(nextI < strLen - 1) else [])

return match(next(keyLen))

for s in matcher(input("字串:"), input("關鍵字:")):
print(s)

def matcher(str: String, key: String) = {
val skip = for(k <- 0 until 256) yield
if(key.init.contains(k.toChar))
key.length - key.lastIndexOf(k.toChar) - 1
else key.length

def next(i: Int): Int = {
if(i < str.length &&
!str.substring(i - key.length + 1, i + 1).equals(key))
next(i + skip(str.charAt(i).toInt))
else i - key.length + 1
}

def find(i: Int): List[String] = {
val nextI = next(i + key.length + 1)
str.substring(i) :: (if(nextI < str.length - 1) find(nextI) else Nil)
}

find(next(key.length))
}

print("字串:")
val str = readLine
print("關鍵字:")
val key = readLine
matcher(str, key).foreach(println)

# encoding: Big5
def matcher(str, key)
skip = (0...256).map { |k|
if key[0...-1].include? k.chr
key.size - key.rindex(k.chr) - 1
else
key.size
end
}

nextIndex = ->(i) {
if i < str.size and str[i - key.size + 1 ... i + 1] != key
nextIndex.call(i + skip[str[i].ord])
else
i - key.size + 1
end
}

match = ->(i) {
nextI = nextIndex.call(i + key.size + 1)
[str[i..-1]] + if nextI < str.size - 1; match.call(nextI) else [] end
}

return match.call(nextIndex.call(key.size))
end

print "字串:"
str = gets.chomp
print "關鍵字:"
key = gets.chomp

matcher(str, key).each do |s|
puts s
end

  • JavaScript
var matcher = function() {
function range(n) {
var r = [];
for(var i = 0; i < n; i++) { r[i] = i; }
return r;
}

return function(str, key) {
var skip = range(256).map(function(k) {
return key.slice(0, -1)
.indexOf(String.fromCharCode(k)) !== -1 ?
key.length - key.lastIndexOf(String.fromCharCode(k)) - 1 :
key.length;
});

function next(i) {
return i < str.length &&
str.slice(i - key.length + 1, i + 1) !== key ?
next(i + skip[str.charCodeAt(i)]) :
i - key.length + 1
}

function match(i) {
var nextI = next(i + key.length + 1);
return [str.slice(i)].concat(
nextI < str.length - 1 ? match(nextI) : []);
}

return match(next(key.length))
};
}();

matcher('This is a test', 'is').forEach(function(s) {
print(s);
});

import Data.Char
import Data.List

rindex elem list =
let (Just a) = elem `elemIndex` (reverse list)
in length list - a - 1

matcher str key = find $ next keyLen
where strLen = length str
keyLen = length key
skip = [if (chr k) `elem` (init key) then
keyLen - rindex (chr k) key - 1
else keyLen | k <- [0..255]]
next i = if i < strLen &&
(drop (i - keyLen + 1) . take (i + 1) $ str) /= key
then next (i + skip !! (ord (str !! i)))
else i - keyLen + 1
find i = let nextI = next $ i + length key + 1
in (drop i str) : if nextI < strLen - 1 then find nextI
else []

main = do
putStrLn "String..."
str <- getLine
putStrLn "Keyword..."
key <- getLine
sequence [putStrLn s | s <- matcher str key]