想持久運營一款 Web 或移動端的產品,對內容進行必要的把關必不可少。這裡分享一個基於 DFA 演算法的高性能敏感詞、髒詞檢測過濾類(C#)。
【概述】要做好一個 Web 系統的安全運維,除了常規的防注入、防入侵等,還有一項工作是檢測並過濾敏感詞、髒詞。這件事做得不好,輕則導致一場投訴或糾紛,重則導致產品被勒令關閉停運。
廢話少說,先看下代碼,可以拿過去直接使用。
using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace OpenCore.ContentSecurity
{
    /// <summary>
    /// Detects and masks forbidden keywords in a piece of text.
    /// Dictionary words are bucketed by their first character, so the input is scanned
    /// once from left to right and only the words sharing the current first character
    /// are tried at each position (no per-word <c>Replace</c> pass over the input).
    /// Background reading: https://blog.csdn.net/u011966339/article/details/72832197
    /// Change log:
    ///   2020-4-15: dictionaries are loaded once into a static lexicon instead of per call;
    ///              several dictionary files can be loaded; robustness improvements.
    /// </summary>
    public class SensitiveWordFilter
    {
        /// <summary>Dictionary files passed to the most recent <see cref="Init"/> call.</summary>
        private static string[] dictionaryPathList = null;

        /// <summary>
        /// In-memory lexicon indexed by the first character of each word.
        /// It has one slot for every possible <see cref="char"/> value (0..65535), so any
        /// character can be used as an index without a range check.
        /// </summary>
        private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[char.MaxValue + 1];

        /// <summary>Serialises dictionary (re)loads triggered through <see cref="Init"/>.</summary>
        private static readonly object lockObj = new object();

        /// <summary>Set once the first Traditional-to-Simplified conversion failure has been logged.</summary>
        private static bool conversionFailureLogged = false;

        /// <summary>
        /// Loads the given dictionary files into the shared lexicon.
        /// If the list is null/empty or any file is missing, the problem is logged and
        /// the lexicon is left as it was.
        /// </summary>
        /// <param name="sDictionaryFileName">Paths of the dictionary files, one word per line.</param>
        public static void Init(string[] sDictionaryFileName)
        {
            lock (lockObj)
            {
                dictionaryPathList = sDictionaryFileName;
                LoadDictionary();
            }
        }

        public SensitiveWordFilter()
        {
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Forbidden words found by the most recent <see cref="getDataByFilter"/> call,
        /// as they appeared in the input (including any separator characters skipped inside a hit).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>Returns true for characters in the CJK range U+4E00..U+9FA5.</summary>
        private static bool isCHS(char character)
        {
            int charVal = (int)character;
            return (charVal >= 0x4e00 && charVal <= 0x9fa5);
        }

        /// <summary>Returns true for the ASCII digits '0'..'9'.</summary>
        private static bool isNum(char character)
        {
            int charVal = (int)character;
            return (charVal >= 48 && charVal <= 57);
        }

        /// <summary>Returns true for the ASCII letters 'a'..'z' and 'A'..'Z'.</summary>
        private static bool isAlphabet(char character)
        {
            int charVal = (int)character;
            return ((charVal >= 97 && charVal <= 122) || (charVal >= 65 && charVal <= 90));
        }

        /// <summary>
        /// Returns true when the character is neither a CJK ideograph, an ASCII digit nor an
        /// ASCII letter. Such characters may be skipped when they are inserted between the
        /// characters of a forbidden word.
        /// </summary>
        private static bool IsSeparator(char character)
        {
            return !isCHS(character) && !isNum(character) && !isAlphabet(character);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// The output always has the same length as the input, so indexes into the
        /// normalised text are valid indexes into the original text.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The half-width, lower-case form of <paramref name="input"/>.</returns>
        /// <remarks>
        /// The full-width space is 12288 and the half-width space is 32. Every other
        /// full-width character (65281-65374) is its half-width counterpart (33-126) plus 65248.
        /// </remarks>
        private static string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                }
                else if (c[i] > 65280 && c[i] < 65375)
                {
                    c[i] = (char)(c[i] - 65248);
                }
                // Invariant, per-character lowering: independent of the thread culture
                // and guaranteed not to change the length of the text.
                c[i] = char.ToLowerInvariant(c[i]);
            }
            return new string(c);
        }

        /// <summary>
        /// Converts the text to Simplified Chinese via <c>Strings.StrConv</c>.
        /// If the conversion throws, the first failure is logged and the input is returned unchanged.
        /// </summary>
        /// <param name="sInput">Text to convert.</param>
        /// <returns>The converted text, the input itself on failure, or an empty string for null/empty input.</returns>
        private static string ToSimplifiedChiniese(string sInput)
        {
            if (string.IsNullOrEmpty(sInput))
            {
                return string.Empty;
            }
            try
            {
                return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
            }
            catch (Exception ex)
            {
                // Best effort: the conversion is only an extra alias for the dictionary word.
                // Log once instead of once per dictionary line.
                if (!conversionFailureLogged)
                {
                    conversionFailureLogged = true;
                    SaveLog($"SensitiveWordFilter.ToSimplifiedChiniese.{ex.GetType().Name}:{ex.Message}");
                }
            }
            return sInput;
        }

        /// <summary>
        /// Appends a line to <c>SecurityLog/yyyyMMdd.log</c> under the application base directory.
        /// Logging is best effort: any failure to write is ignored so it can never break filtering.
        /// </summary>
        /// <param name="Msg">Message to append.</param>
        private static void SaveLog(string Msg)
        {
            try
            {
                string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
                if (!Directory.Exists(sPath))
                {
                    Directory.CreateDirectory(sPath);
                }
                sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
            }
            catch (Exception)
            {
                // Deliberately ignored, see summary.
            }
        }

        /// <summary>
        /// Reads every configured dictionary file and publishes a freshly built lexicon.
        /// Each non-blank line is trimmed and normalised with <see cref="ToDBC"/>; its
        /// Simplified-Chinese form is added as well so both scripts are matched.
        /// The new lexicon replaces the old one in a single assignment, so a filter call
        /// never sees a partially filled table.
        /// </summary>
        private static void LoadDictionary()
        {
            string[] paths = dictionaryPathList;
            if (paths == null || paths.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.字典路徑配置為空");
                return;
            }
            foreach (string sFileName in paths)
            {
                if (!File.Exists(sFileName))
                {
                    SaveLog($"SensitiveWordFilter.LoadDictionary.路徑:{sFileName}不是一個有效的文件");
                    return;
                }
            }

            // Ordinal set: removes duplicates across files without sorting.
            HashSet<string> uniqueWords = new HashSet<string>(StringComparer.Ordinal);
            foreach (string sDictionaryFile in paths)
            {
                string[] lines = File.ReadAllLines(sDictionaryFile, Encoding.Default);
                foreach (string line in lines)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }
                    string key = ToDBC(line.Trim());
                    uniqueWords.Add(key);
                    // Also register the Simplified-Chinese spelling of the word.
                    string keySimplified = ToSimplifiedChiniese(key);
                    if (keySimplified.Length > 0)
                    {
                        uniqueWords.Add(keySimplified);
                    }
                }
            }

            WordGroup[] lexicon = new WordGroup[char.MaxValue + 1];
            foreach (string word in uniqueWords)
            {
                if (word.Length == 0)
                {
                    continue;
                }
                WordGroup group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                // Only the tail is stored; the first character is the bucket index.
                group.Add(word.Substring(1));
            }
            MEMORYLEXICON = lexicon;
        }

        /// <summary>
        /// Tries to match the tail of a dictionary word against the text that follows
        /// position <paramref name="start"/> (where the word's first character was found).
        /// Separator characters in the text are skipped between word characters, unless the
        /// separator is exactly the character the word expects next.
        /// </summary>
        /// <param name="normalized">The normalised source text.</param>
        /// <param name="start">Index of the character that matched the word's first character.</param>
        /// <param name="tail">The dictionary word without its first character; may be empty.</param>
        /// <returns>Index of the last matched character, or -1 when the word does not match completely.</returns>
        private static int MatchTail(string normalized, int start, string tail)
        {
            if (tail == null)
            {
                return -1;
            }
            int pos = start;
            for (int i = 0; i < tail.Length; i++)
            {
                pos++;
                while (pos < normalized.Length
                    && normalized[pos] != tail[i]
                    && IsSeparator(normalized[pos]))
                {
                    pos++;
                }
                // Running out of text means the word is only partially present: no match.
                if (pos >= normalized.Length || normalized[pos] != tail[i])
                {
                    return -1;
                }
            }
            return pos;
        }

        /// <summary>
        /// Scans the input and replaces every character of each forbidden word found
        /// (including separators skipped inside the word) with <paramref name="replaceChar"/>.
        /// Matching is done on the half-width, lower-case form of the input; when several
        /// dictionary words match at the same position, the longest one is taken.
        /// The hits are available through <see cref="IllegalWords"/> afterwards.
        /// </summary>
        /// <param name="sSourceInput">Text to check. Null or empty input is returned unchanged.</param>
        /// <param name="replaceChar">Mask character, for example '*'.</param>
        /// <returns>The input with forbidden words masked.</returns>
        public string getDataByFilter(string sSourceInput, char replaceChar)
        {
            // Clear first so an early return never exposes hits from a previous call.
            illegalWords.Clear();
            if (string.IsNullOrEmpty(sSourceInput))
            {
                return sSourceInput;
            }
            WordGroup[] lexicon = MEMORYLEXICON;
            if (lexicon == null || lexicon.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.getDataByFilter.記憶體字典為空");
                return sSourceInput;
            }

            // Normalise once; same length as the input, so indexes are interchangeable.
            string normalized = ToDBC(sSourceInput);
            char[] masked = sSourceInput.ToCharArray();
            for (int i = 0; i < normalized.Length; i++)
            {
                WordGroup group = lexicon[normalized[i]];
                if (group == null)
                {
                    continue;
                }

                int matchEnd = -1;
                int groupSize = group.Count();
                for (int z = 0; z < groupSize; z++)
                {
                    int end = MatchTail(normalized, i, group.GetWord(z));
                    if (end > matchEnd)
                    {
                        matchEnd = end;
                    }
                }
                if (matchEnd < 0)
                {
                    continue;
                }

                illegalWords.Add(sSourceInput.Substring(i, matchEnd - i + 1));
                for (int pos = i; pos <= matchEnd; pos++)
                {
                    masked[pos] = replaceChar;
                }
                // Resume after the hit; the loop increment moves to the next character.
                i = matchEnd;
            }
            return new string(masked);
        }
    }

    /// <summary>
    /// The set of dictionary words that share the same first character.
    /// Words are stored without that first character.
    /// </summary>
    public class WordGroup
    {
        /// <summary>Words in insertion order, for indexed access.</summary>
        private readonly List<string> groupList = new List<string>();

        /// <summary>Same words as <see cref="groupList"/>, for constant-time duplicate checks.</summary>
        private readonly HashSet<string> knownWords = new HashSet<string>(StringComparer.Ordinal);

        public WordGroup()
        {
        }

        /// <summary>
        /// Adds a word unless it is already present. Null is ignored.
        /// </summary>
        /// <param name="word">Word tail to add; may be empty.</param>
        public void Add(string word)
        {
            if (word == null)
            {
                return;
            }
            if (knownWords.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>
        /// Returns the number of words in the group.
        /// </summary>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Returns the word at the given zero-based position.
        /// </summary>
        /// <param name="index">Position in insertion order.</param>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
}
上面是一個完整的、獨立的實現類。下面給一個簡單的調用示例:
// Global configuration: do this once for the whole program; no further setup is needed afterwards.
// NOTE: the paths must be correct, otherwise the filter cannot take effect.
string[] dictionaryFiles =
{
    @"C:\Users\x\Downloads\網站需要過濾的敏感詞\mgck-master\暴恐詞庫.txt",
    @"C:\Users\x\Downloads\網站需要過濾的敏感詞\mgck-master\反動詞庫.txt",
    @"C:\Users\x\Downloads\網站需要過濾的敏感詞\mgck-master\民生詞庫.txt",
    @"C:\Users\x\Downloads\網站需要過濾的敏感詞\mgck-master\色情詞庫.txt",
    @"C:\Users\x\Downloads\網站需要過濾的敏感詞\mgck-master\貪腐詞庫.txt",
    @"C:\Users\x\Downloads\網站需要過濾的敏感詞\mgck-master\其他詞庫.txt"
};
SensitiveWordFilter.Init(dictionaryFiles);

// Per the author: the filter may be instantiated in many places and run concurrently.
SensitiveWordFilter wordFilter = new SensitiveWordFilter();

// A few sample inputs to see the effect (only the keys are used).
// Note: the first sample originally did not end in "q"; the blog platform's own
// sensitive-word check blocked publishing, so it was altered. To test it, replace
// the phrase locally with an entry from your dictionary.
Dictionary<string, string> dictTestData = new Dictionary<string, string>
{
    ["殺^人游戲,有人找一夜q"] = "",
    ["數學學習課堂"] = "",
    ["打擊法0功有,法0功毒害大眾"] = ""
};

Dictionary<string, string> dictResult = new Dictionary<string, string>();
foreach (KeyValuePair<string, string> entry in dictTestData)
{
    string sKey = entry.Key;
    // Filter first, then read the words detected by that call.
    string replaced = wordFilter.getDataByFilter(sKey, '|');
    string detected = string.Join(",", wordFilter.IllegalWords ?? new List<string>());
    dictResult[sKey] = $"替換後:{replaced}, ------------檢測違禁詞:{detected}";
}

string sResultJson = JsonConverter.SerializeObject(dictResult);
Utils.SaveLog(sResultJson);
最後,給一下列印的結果:
"殺^人游戲,有人找一夜q": 替換後: "殺^人游戲,有人找|||", ------------檢測違禁詞:一夜q",
"數學學習課堂": 替換後:"數學學習課堂", ------------檢測違禁詞:,
"打擊法0功有,法0功毒害大眾": 替換後:"打擊|||有,|||毒害大眾", ------------檢測違禁詞:法0功,法0功"
-------------附
詞庫下載地址:https://codeload.github.com/chason777777/mgck/zip/master