實現該敏感詞過濾採用的是 DFA 演算法,參考文章:https://blog.csdn.net/chenssy/article/details/26961957 。具體實現步驟如下:第一步,構建敏感詞庫(WordsLibrary)類……
實現該敏感詞過濾採用的是 DFA 演算法,參考文章:https://blog.csdn.net/chenssy/article/details/26961957
具體實現步驟如下:
第一步,構建敏感詞庫(WordsLibrary)類:
using System;
using System.Collections.Generic;
using System.Linq;

namespace ContentSafe.SensitiveWord
{
    /// <summary>
    /// Sensitive-word library: holds the word list and the character tree built from it.
    /// </summary>
    public class WordsLibrary
    {
        /// <summary>
        /// A node of the word tree. Each node carries one character of a word.
        /// </summary>
        public class ItemTree
        {
            /// <summary>The character stored in this node.</summary>
            public char Item { get; set; }

            /// <summary>True when the path from the root to this node spells a complete word.</summary>
            public bool IsEnd { get; set; }

            /// <summary>Child nodes; null for a node that has no children.</summary>
            public List<ItemTree> Child { get; set; }
        }

        /// <summary>
        /// Root of the word tree. The root's own <see cref="ItemTree.Item"/> is a placeholder;
        /// the first characters of the words are the root's children.
        /// </summary>
        public ItemTree Library { get; private set; }

        /// <summary>
        /// The sensitive words the tree was built from.
        /// </summary>
        public string[] Words { get; protected set; }

        /// <summary>
        /// Creates a library whose words come from <see cref="LoadWords"/>.
        /// </summary>
        public WordsLibrary()
        {
            LoadWords();
            Init();
        }

        /// <summary>
        /// Creates a library from an explicit word list.
        /// </summary>
        /// <param name="words">
        /// Sensitive words. When null, whatever <see cref="LoadWords"/> assigned to
        /// <see cref="Words"/> is used instead.
        /// </param>
        public WordsLibrary(string[] words)
        {
            // The hook is still invoked so subclasses that rely on it keep working.
            LoadWords();

            // The words must be assigned BEFORE the tree is built; building first and
            // assigning afterwards leaves the tree empty.
            if (words != null)
                Words = words;

            Init();
        }

        /// <summary>
        /// Loads the sensitive words. Override to customise where the words come from
        /// (assign the result to <see cref="Words"/>). The base implementation does nothing.
        /// </summary>
        public virtual void LoadWords()
        {
        }

        /// <summary>
        /// Builds <see cref="Library"/> from <see cref="Words"/>.
        /// </summary>
        private void Init()
        {
            if (Words == null)
                Words = new[] { "" };

            Library = new ItemTree()
            {
                Item = 'R',
                IsEnd = false,
                Child = CreateTree(Words)
            };
        }

        /// <summary>
        /// Builds the first level of the tree (and everything below it) from the given words.
        /// </summary>
        /// <param name="words">Sensitive words; null or empty entries are skipped.</param>
        /// <returns>The list of first-level nodes; never null, possibly empty.</returns>
        private List<ItemTree> CreateTree(string[] words)
        {
            // Always return a list so that consumers can search the root's children
            // without a null check, even when there are no words at all.
            List<ItemTree> tree = new List<ItemTree>();
            if (words == null)
                return tree;

            foreach (string word in words)
            {
                if (!string.IsNullOrEmpty(word))
                    AddWord(tree, word);
            }

            return tree;
        }

        /// <summary>
        /// Inserts a single word into the tree, reusing nodes for shared prefixes.
        /// </summary>
        /// <param name="roots">First-level node list to insert into.</param>
        /// <param name="word">A non-empty word.</param>
        private static void AddWord(List<ItemTree> roots, string word)
        {
            List<ItemTree> level = roots;
            ItemTree current = null;

            for (int i = 0; i < word.Length; i++)
            {
                char cha = word[i];
                ItemTree next = level.Find(e => e.Item == cha);
                if (next == null)
                {
                    next = new ItemTree() { Item = cha, IsEnd = false, Child = null };
                    level.Add(next);
                }

                current = next;

                // Only allocate a child list when another character follows, so that
                // nodes without children keep Child == null.
                if (i < word.Length - 1)
                {
                    if (current.Child == null)
                        current.Child = new List<ItemTree>();
                    level = current.Child;
                }
            }

            current.IsEnd = true;
        }
    }
}
第二步,構建敏感詞檢測(ContentCheck)類:
using System;
using System.Collections.Generic;
using System.Linq;

namespace ContentSafe.SensitiveWord
{
    /// <summary>
    /// Finds and replaces sensitive words in a text using a word tree.
    /// </summary>
    public class ContentCheck
    {
        /// <summary>
        /// Text used by the parameterless overloads.
        /// </summary>
        public string Text { private get; set; }

        /// <summary>
        /// Root of the word tree to match against.
        /// </summary>
        public WordsLibrary.ItemTree Library { private get; set; }

        /// <summary>
        /// Creates a checker with no tree and no text; set <see cref="Library"/> before use.
        /// </summary>
        public ContentCheck()
        {
        }

        /// <summary>
        /// Creates a checker bound to the given library.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The library has no word tree.</exception>
        public ContentCheck(WordsLibrary library)
        {
            if (library == null)
                throw new ArgumentNullException("library");
            if (library.Library == null)
                throw new InvalidOperationException("敏感詞庫未初始化");

            Library = library.Library;
        }

        /// <summary>
        /// Creates a checker bound to the given library and text.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to check.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public ContentCheck(WordsLibrary library, string text) : this(library)
        {
            if (text == null)
                throw new ArgumentNullException("text", "檢測文本不能為null");

            Text = text;
        }

        /// <summary>
        /// Scans the text left to right and returns the non-overlapping matches.
        /// At each position the longest word starting there wins.
        /// </summary>
        /// <param name="text">Text to check.</param>
        /// <returns>Pairs of (start index, length), in ascending start order; never null.</returns>
        /// <exception cref="InvalidOperationException">No word tree has been set.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        private List<KeyValuePair<int, int>> FindMatches(string text)
        {
            if (Library == null)
                throw new InvalidOperationException("未設置敏感詞庫 詞樹");
            if (text == null)
                throw new ArgumentNullException("text");

            List<KeyValuePair<int, int>> matches = new List<KeyValuePair<int, int>>();
            int start = 0;
            while (start < text.Length)
            {
                int length = LongestMatchLength(text, start);
                if (length > 0)
                {
                    matches.Add(new KeyValuePair<int, int>(start, length));
                    start += length;
                }
                else
                {
                    // Nothing starts here; every position gets its own attempt, so a
                    // failed partial match can never hide a word that begins inside it.
                    start++;
                }
            }

            return matches;
        }

        /// <summary>
        /// Walks the tree from the root along the characters beginning at <paramref name="start"/>.
        /// </summary>
        /// <param name="text">Text to check.</param>
        /// <param name="start">Index of the first character to match.</param>
        /// <returns>Length of the longest word starting at <paramref name="start"/>, or 0 if none.</returns>
        private int LongestMatchLength(string text, int start)
        {
            List<WordsLibrary.ItemTree> level = Library.Child;
            int longest = 0;

            for (int pos = start; pos < text.Length && level != null; pos++)
            {
                char cha = text[pos];
                WordsLibrary.ItemTree node = level.Find(e => e.Item == cha);
                if (node == null)
                    break;

                // A node without children is treated as a word end even when IsEnd is
                // not set, so hand-built trees behave as before.
                if (node.IsEnd || node.Child == null)
                    longest = pos - start + 1;

                // Keep going: a longer word may follow, but the shorter hit is remembered.
                level = node.Child;
            }

            return longest;
        }

        /// <summary>
        /// Replaces every character of every sensitive word in the text.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to check.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The text with sensitive words masked; the same string when nothing matched.</returns>
        public static string SensitiveWordsReplace(WordsLibrary library, string text, char newChar = '*')
        {
            return new ContentCheck(library).SensitiveWordsReplace(text, newChar);
        }

        /// <summary>
        /// Replaces every character of every sensitive word in the text.
        /// </summary>
        /// <param name="text">Text to check.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The text with sensitive words masked; the same string when nothing matched.</returns>
        public string SensitiveWordsReplace(string text, char newChar = '*')
        {
            List<KeyValuePair<int, int>> matches = FindMatches(text);
            if (matches.Count == 0)
                return text;

            char[] chars = text.ToCharArray();
            foreach (KeyValuePair<int, int> match in matches)
            {
                int end = match.Key + match.Value;
                for (int k = match.Key; k < end; k++)
                    chars[k] = newChar;
            }

            return new string(chars);
        }

        /// <summary>
        /// Replaces the sensitive words in <see cref="Text"/>.
        /// </summary>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The masked text.</returns>
        /// <exception cref="InvalidOperationException"><see cref="Text"/> has not been set.</exception>
        public string SensitiveWordsReplace(char newChar = '*')
        {
            if (Text == null)
                throw new InvalidOperationException("未設置檢測文本");

            return SensitiveWordsReplace(Text, newChar);
        }

        /// <summary>
        /// Finds the sensitive words contained in the text.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to check.</param>
        /// <returns>The distinct words found, or null when there are none.</returns>
        public static List<string> FindSensitiveWords(WordsLibrary library, string text)
        {
            ContentCheck check = new ContentCheck(library, text);
            return check.FindSensitiveWords();
        }

        /// <summary>
        /// Finds the sensitive words contained in the text.
        /// </summary>
        /// <param name="text">Text to check.</param>
        /// <returns>
        /// The distinct words found, in order of first appearance, or null when there are none
        /// (null rather than an empty list is kept for existing callers).
        /// </returns>
        public List<string> FindSensitiveWords(string text)
        {
            List<KeyValuePair<int, int>> matches = FindMatches(text);
            if (matches.Count == 0)
                return null;

            // Each match is extracted on its own, so two words that happen to be
            // adjacent in the text are reported separately instead of glued together.
            return matches
                .Select(m => text.Substring(m.Key, m.Value))
                .Distinct()
                .ToList();
        }

        /// <summary>
        /// Finds the sensitive words contained in <see cref="Text"/>.
        /// </summary>
        /// <returns>The distinct words found, or null when there are none.</returns>
        /// <exception cref="InvalidOperationException"><see cref="Text"/> has not been set.</exception>
        public List<string> FindSensitiveWords()
        {
            if (Text == null)
                throw new InvalidOperationException("未設置檢測文本");

            return FindSensitiveWords(Text);
        }
    }
}
第三步,測試與使用方法:
// Sensitive word list; ready-made lists can be searched for and downloaded online.
string[] sensitiveWords = { "敏感詞1", "敏感詞2", "含有", "垃圾" };

// WordsLibrary can be inherited: override LoadWords to customise how the words are imported.
WordsLibrary wordsLibrary = new WordsLibrary(sensitiveWords);

string content = "在任意一個文本中都可能包含敏感詞1、2、3等等,只要含有敏感詞都會被找出來,比如:垃圾";

// Create the content checker for this library and text.
ContentCheck checker = new ContentCheck(wordsLibrary, content);

// Look up the sensitive words; returns the list of words found.
var list = checker.FindSensitiveWords();

// Replace the sensitive words; returns the processed string.
var str = checker.SensitiveWordsReplace();
該實現方案不止這一種使用方法,更多使用方法可自行研究。