Python之英文单词纠错

  文本纠错作为重新封装的文本分析平台的核心功能,一直受到公司各层的广泛关注。说到文本纠错,最简单的莫过于英文单词纠错,这也是我们在 Word 中常见的拼写检查功能。纠错概率的计量方法可参考《NLP单词纠错》和《中文自动纠错讲解》,这里不再赘述。
  语料下载地址:big.txt
  Python英文单词纠错程序如下:

import re
from collections import Counter

#==== Train a vocabulary with word frequencies ====
def words(text):
    """Return every run of word characters in `text`, lowercased, in order.

    Matching uses the regex ``\\w+``, so punctuation and whitespace act as
    separators and are dropped.
    """
    lowered = text.lower()
    token_pattern = re.compile(r'\w+')
    return token_pattern.findall(lowered)

# Word -> occurrence count for the whole training corpus.
# The corpus is read inside a `with` block so the file handle is closed as
# soon as the text has been loaded instead of being left open.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used exactly as before; confirm the corpus encoding before pinning one.
with open('d:\\big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word`: its count in WORDS over `N`.

    `N` defaults to the total of all counts in WORDS; that default is
    evaluated once, when this function is defined. Words missing from WORDS
    have a count of 0 and therefore a probability of 0.
    """
    occurrences = WORDS[word]
    return occurrences / N

#==== Given a word, enumerate all candidate spellings ====
def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement by a lowercase ASCII letter, or an insertion
    of a lowercase ASCII letter. The result may contain `word` itself
    (replacing a letter by the same letter).
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    found = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut point, including the very end.
        for ch in alphabet:
            found.add(head + ch + tail)
        if not tail:
            # Nothing right of the cut: no delete/transpose/replace here.
            continue
        rest = tail[1:]
        found.add(head + rest)  # deletion of tail[0]
        if len(tail) > 1:
            # Transposition of the two characters right after the cut.
            found.add(head + tail[1] + tail[0] + tail[2:])
        for ch in alphabet:
            found.add(head + ch + rest)  # replacement of tail[0]
    return found

def edits2(word):
    """Lazily yield every string two simple edits away from `word`.

    The first-level edits are computed immediately; the second level is
    produced on demand. Duplicates are not removed.
    """
    one_away = edits1(word)
    return (second for first in one_away for second in edits1(first))
#==== Return the candidate words ====
def candidates(word):
    """Return the possible spelling corrections for `word`.

    Tiers are tried in order and the first non-empty one wins: the word
    itself if known, then known words one edit away, then known words two
    edits away. If none match, the result is the list ``[word]``.
    """
    exact = known([word])
    if exact:
        return exact
    near = known(edits1(word))
    if near:
        return near
    far = known(edits2(word))
    if far:
        return far
    return [word]

def known(words):
    """Return, as a set, the members of `words` that are keys of WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

#==== Output the most probable correction ====
def correction(word):
    """Return the candidate for `word` that has the highest probability `P`."""
    options = candidates(word)
    best = max(options, key=P)
    return best

  程序结果如下:

# Example calls; the comment under each call is the output reported by the
# article for its corpus.
correction('corrected')
# 'connected'
correction('speling')
# 'seeing'

分类: 自然语言处理