// @ShikiSuen #4
import Foundation
// Custom dictionary: maps each word that must be kept intact to the list of
// tokens emitted for it, so the tokenizer can look words up later.
let customDictionary: [String: [String]] = [
    "白术": ["白术"],
    // Traditional-script form of the entry above; the sample sentence uses
    // this spelling, so without it the entry can never match.
    "白朮": ["白朮"],
    "大夫": ["大夫"],
    "七七": ["七七"]
]
/// Splits `sentence` into word tokens while keeping every `customDictionary`
/// word intact.
///
/// The sentence is scanned for dictionary keys first (the longest key wins at
/// any position). Each occurrence is emitted as the key's mapped entries, and
/// only the text between occurrences is segmented by `NSLinguisticTagger`,
/// with whitespace and punctuation omitted. Consulting the dictionary only
/// after tagging cannot preserve a word the tagger has already split, which
/// is why the dictionary pass runs first.
///
/// - Parameter sentence: The text to tokenize.
/// - Returns: The tokens in order of appearance; empty for an empty sentence.
func tokenize(sentence: String) -> [String] {
    let text = sentence as NSString
    let fullLength = text.length
    var tokens: [String] = []
    guard fullLength > 0 else { return tokens }

    let tagger = NSLinguisticTagger(tagSchemes: [.tokenType], options: 0)
    tagger.string = sentence
    let options: NSLinguisticTagger.Options = [.omitWhitespace, .omitPunctuation]

    // Longest key first, so an entry wins over any shorter entry that is its
    // prefix. Ties are ordered alphabetically to keep the scan deterministic.
    let protectedWords = customDictionary.keys
        .filter { !$0.isEmpty }
        .sorted { lhs, rhs in
            let (lhsLength, rhsLength) = (lhs.utf16.count, rhs.utf16.count)
            return lhsLength != rhsLength ? lhsLength > rhsLength : lhs < rhs
        }

    // Appends the tagger's word tokens for the UTF-16 span start..<end.
    func appendTaggedTokens(from start: Int, to end: Int) {
        guard end > start else { return }
        let gap = NSRange(location: start, length: end - start)
        tagger.enumerateTags(in: gap, unit: .word, scheme: .tokenType, options: options) { _, tokenRange, _ in
            // Clip to the gap so a token straddling its edge never pulls in
            // characters that belong to a protected word.
            let clipped = NSIntersectionRange(tokenRange, gap)
            guard clipped.length > 0 else { return }
            tokens.append(text.substring(with: clipped))
        }
    }

    var gapStart = 0
    var location = 0
    while location < fullLength {
        let remaining = NSRange(location: location, length: fullLength - location)

        // Look for a dictionary word that starts exactly at `location`.
        var matched: (word: String, range: NSRange)?
        for word in protectedWords {
            let range = text.range(of: word, options: [.anchored, .literal], range: remaining)
            if range.location != NSNotFound {
                matched = (word, range)
                break
            }
        }

        if let matched = matched {
            // Flush the untagged text before the protected word, then emit
            // the dictionary's entries for the word itself.
            appendTaggedTokens(from: gapStart, to: location)
            tokens.append(contentsOf: customDictionary[matched.word] ?? [matched.word])
            location = NSMaxRange(matched.range)
            gapStart = location
        } else {
            // Advance by a whole composed character so the scan never stops
            // in the middle of a surrogate pair or combining sequence.
            location = NSMaxRange(text.rangeOfComposedCharacterSequence(at: location))
        }
    }
    appendTaggedTokens(from: gapStart, to: fullLength)

    return tokens
}
// Demo driver: tokenize every sample sentence and print it beside its tokens.
let texts: [String] = ["有個大夫叫白朮,他有個徒弟叫七七。"]
texts.forEach { text in
    let tokens = tokenize(sentence: text)
    print("\(text) --> \(tokens)")
}