Java實現DFA算法對敏感詞、廣告詞過濾功能示例

一、前言

開發中經常要處理用戶提交的文字，因此會涉及敏感詞過濾功能。本文參考了資料中 DFA（有窮狀態機）算法的實現，通過創建有向圖完成對敏感詞、廣告詞的過濾，而且效率較好，所以分享一下。

具體實現：

1、匹配大小寫過濾
2、匹配全角半角過濾
3、匹配停頓詞過濾
4、敏感詞重複詞過濾

例如：

支持如下類型的過濾檢測：

fuck 全小寫

FuCk 大小寫混合

ｆｕｃｋ全角半角

f!!!u&c ###k 停頓詞

fffuuuucccckkk 重復詞

敏感詞過濾的做法有很多，我簡單描述我現在理解的幾種：

①查詢數據庫當中的敏感詞，循環每一個敏感詞，然后去輸入的文本中從頭到尾搜索一遍，看是否存在此敏感詞，有則做相應的處理，這種方式講白了就是找到一個處理一個。

優點：so easy。用java代碼實現基本沒什么難度。

缺點：這效率讓我心中奔過十萬匹草泥馬，而且匹配起來是不是有些蛋疼？如果是英文，你會發現一個很無語的事情：比如英文 a 是敏感詞，那我如果是一篇英文文檔，那程序它妹的得處理多少次敏感詞？誰能告訴我？

②傳說中的DFA算法（有窮自動機），也正是我要給大家分享的，畢竟感覺比較通用，算法的原理希望大家能夠自己去網上查查資料，這里就不詳細說明了。

優點：至少比上面那sb效率高點。

缺點：對于學過算法的應該不難，對于沒學過算法的用起來也不難，就是理解起來有點gg疼，匹配效率也不高，比較耗費內存，敏感詞越多，內存占用的就越大。

③第三種在這里要特別說明一下，那就是你自己去寫一個算法吧，或者在現有的算法的基礎上去優化，這也是小alan追求的至高境界之一，如果哪位淫兄有自己的想法一定別忘了小alan，可以加小alan的qq：810104041教小alan兩招耍耍。

二、代碼實現

其目錄結構如下：

Java實現DFA算法對敏感詞、廣告詞過濾功能示例

其中resources資源目錄中：

stopwd.txt ：停頓詞，匹配時直接過濾。

wd.txt：敏感詞庫。

1、wordfilter敏感詞過濾類

源碼如下：

									package org.andy.sensitivewdfilter; 

									import java.io.BufferedReader;
									import java.io.IOException;
									import java.io.InputStream;
									import java.io.InputStreamReader;
									import java.nio.charset.StandardCharsets;
									import java.util.ArrayList;
									import java.util.HashMap;
									import java.util.HashSet;
									import java.util.List;
									import java.util.Map;
									import java.util.Set;

									import org.andy.sensitivewdfilter.util.bcconvert;

									/** 

									 * 創建時間：2016年8月30日 下午3:01:12 

									 * 

									 * 思路： 創建一個filterset，枚舉了0~65535的所有char是否是某個敏感詞開頭的狀態 

									 * 

									 * 判斷是否是 敏感詞開頭 | | 是 不是 獲取頭節點 ok--下一個字 然后逐級遍歷，dfa算法 

									 * 

									 * @author andy 

									 * @version 2.2 

									 */

									public class wordfilter { 

									  private static final filterset set = new filterset(); // 存儲首字 

									  private static final map<integer, wordnode> nodes = new hashmap<integer, wordnode>(1024, 1); // 存儲節點 

									  private static final set<integer> stopwdset = new hashset<>(); // 停頓詞 

									  private static final char sign = '*'; // 敏感詞過濾替換 

									  static { 

									    try { 

									      long a = system.nanotime(); 

									      init(); 

									      a = system.nanotime() - a; 

									      system.out.println("加載時間 : " + a + "ns"); 

									      system.out.println("加載時間 : " + a / 1000000 + "ms"); 

									    } catch (exception e) { 

									      throw new runtimeexception("初始化過濾器失敗"); 

									    } 

									  } 

									  private static void init() { 

									    // 獲取敏感詞 

									    addsensitiveword(readwordfromfile("wd.txt")); 

									    addstopword(readwordfromfile("stopwd.txt")); 

									  } 

									  /** 

									   * 增加敏感詞 

									   * @param path 

									   * @return 

									   */

									  private static list<string> readwordfromfile(string path) { 

									    list<string> words; 

									    bufferedreader br = null; 

									    try { 

									      br = new bufferedreader(new inputstreamreader(wordfilter.class.getclassloader().getresourceasstream(path))); 

									      words = new arraylist<string>(1200); 

									      for (string buf = ""; (buf = br.readline()) != null;) { 

									        if (buf == null || buf.trim().equals("")) 

									          continue; 

									        words.add(buf); 

									      } 

									    } catch (exception e) { 

									      throw new runtimeexception(e); 

									    } finally { 

									      try { 

									        if (br != null) 

									          br.close(); 

									      } catch (ioexception e) { 

									      } 

									    } 

									    return words; 

									  } 

									  /** 

									   * 增加停頓詞 

									   * 

									   * @param words 

									   */

									  private static void addstopword(final list<string> words) { 

									    if (words != null && words.size() > 0) { 

									      char[] chs; 

									      for (string curr : words) { 

									        chs = curr.tochararray(); 

									        for (char c : chs) { 

									          stopwdset.add(charconvert(c)); 

									        } 

									      } 

									    } 

									  } 

									  /** 

									   * 添加dfa節點 

									   * @param words 

									   */

									  private static void addsensitiveword(final list<string> words) { 

									    if (words != null && words.size() > 0) { 

									      char[] chs; 

									      int fchar; 

									      int lastindex; 

									      wordnode fnode; // 首字母節點 

									      for (string curr : words) { 

									        chs = curr.tochararray(); 

									        fchar = charconvert(chs[0]); 

									        if (!set.contains(fchar)) {// 沒有首字定義 

									          set.add(fchar);// 首字標志位 可重復add,反正判斷了，不重復了 

									          fnode = new wordnode(fchar, chs.length == 1); 

									          nodes.put(fchar, fnode); 

									        } else { 

									          fnode = nodes.get(fchar); 

									          if (!fnode.islast() && chs.length == 1) 

									            fnode.setlast(true); 

									        } 

									        lastindex = chs.length - 1; 

									        for (int i = 1; i < chs.length; i++) { 

									          fnode = fnode.addifnoexist(charconvert(chs[i]), i == lastindex); 

									        } 

									      } 

									    } 

									  } 

									  /** 

									   * 過濾判斷 將敏感詞轉化為成屏蔽詞 

									   * @param src 

									   * @return 

									   */

									  public static final string dofilter(final string src) { 

									    char[] chs = src.tochararray(); 

									    int length = chs.length; 

									    int currc; 

									    int k; 

									    wordnode node; 

									    for (int i = 0; i < length; i++) { 

									      currc = charconvert(chs[i]); 

									      if (!set.contains(currc)) { 

									        continue; 

									      } 

									      node = nodes.get(currc);// 日 2 

									      if (node == null)// 其實不會發生，習慣性寫上了 

									        continue; 

									      boolean couldmark = false; 

									      int marknum = -1; 

									      if (node.islast()) {// 單字匹配（日） 

									        couldmark = true; 

									        marknum = 0; 

									      } 

									      // 繼續匹配（日你/日你妹），以長的優先 

									      // 你-3 妹-4 夫-5 

									      k = i; 

									      for (; ++k < length;) { 

									        int temp = charconvert(chs[k]); 

									        if (stopwdset.contains(temp)) 

									          continue; 

									        node = node.querysub(temp); 

									        if (node == null)// 沒有了 

									          break; 

									        if (node.islast()) { 

									          couldmark = true; 

									          marknum = k - i;// 3-2 

									        } 

									      } 

									      if (couldmark) { 

									        for (k = 0; k <= marknum; k++) { 

									          chs[k + i] = sign; 

									        } 

									        i = i + marknum; 

									      } 

									    } 

									    return new string(chs); 

									  } 

									  /** 

									   * 是否包含敏感詞 

									   * @param src 

									   * @return 

									   */

									  public static final boolean iscontains(final string src) { 

									    char[] chs = src.tochararray(); 

									    int length = chs.length; 

									    int currc; 

									    int k; 

									    wordnode node; 

									    for (int i = 0; i < length; i++) { 

									      currc = charconvert(chs[i]); 

									      if (!set.contains(currc)) { 

									        continue; 

									      } 

									      node = nodes.get(currc);// 日 2 

									      if (node == null)// 其實不會發生，習慣性寫上了 

									        continue; 

									      boolean couldmark = false; 

									      if (node.islast()) {// 單字匹配（日） 

									        couldmark = true; 

									      } 

									      // 繼續匹配（日你/日你妹），以長的優先 

									      // 你-3 妹-4 夫-5 

									      k = i; 

									      for (; ++k < length;) { 

									        int temp = charconvert(chs[k]); 

									        if (stopwdset.contains(temp)) 

									          continue; 

									        node = node.querysub(temp); 

									        if (node == null)// 沒有了 

									          break; 

									        if (node.islast()) { 

									          couldmark = true; 

									        } 

									      } 

									      if (couldmark) { 

									        return true; 

									      } 

									    } 

									    return false; 

									  } 

									  /** 

									   * 大寫轉化為小寫 全角轉化為半角 

									   * 

									   * @param src 

									   * @return 

									   */

									  private static int charconvert(char src) { 

									    int r = bcconvert.qj2bj(src); 

									    return (r >= 'a' && r <= 'z') ? r + 32 : r; 

									  } 

									}

其中：

iscontains ：是否包含敏感詞
dofilter：過濾敏感詞

2、wordnode敏感詞節點

									package org.andy.sensitivewdfilter; 

									import java.util.LinkedList;
									import java.util.List;

									/**
									 * One node of the sensitive-word trie: a character value, its child nodes and a
									 * flag telling whether a sensitive word ends at this node.
									 *
									 * <p>Created: 2016-08-30 15:07:45
									 *
									 * @author andy
									 * @version 2.2
									 */
									public class wordnode {

									  /** Character value this node stands for. */
									  private final int value;

									  /** Child nodes; stays {@code null} until the first child is added. */
									  private List<wordnode> subnodes;

									  /** {@code true} when a sensitive word ends at this node; {@code false} by default. */
									  private boolean islast;

									  /**
									   * Creates a node that does not end a word.
									   *
									   * @param value character value of the node
									   */
									  public wordnode(int value) {
									    this.value = value;
									  }

									  /**
									   * Creates a node.
									   *
									   * @param value  character value of the node
									   * @param islast whether a sensitive word ends at this node
									   */
									  public wordnode(int value, boolean islast) {
									    this.value = value;
									    this.islast = islast;
									  }

									  /**
									   * Appends a child, creating the child list on first use.
									   *
									   * @param subnode child to append
									   * @return the same {@code subnode} that was passed in
									   */
									  private wordnode addsubnode(final wordnode subnode) {
									    if (subnodes == null) {
									      subnodes = new LinkedList<wordnode>();
									    }
									    subnodes.add(subnode);
									    return subnode;
									  }

									  /**
									   * Returns the child with the given value, creating it when it does not exist.
									   * An existing child is additionally marked as a word end when {@code islast}
									   * is {@code true}; the mark is never cleared.
									   *
									   * @param value  character value of the child
									   * @param islast whether a sensitive word ends at the child
									   * @return the existing or newly created child
									   */
									  public wordnode addifnoexist(final int value, final boolean islast) {
									    wordnode existing = querysub(value);
									    if (existing == null) {
									      return addsubnode(new wordnode(value, islast));
									    }
									    if (islast) {
									      existing.islast = true;
									    }
									    return existing;
									  }

									  /**
									   * Looks up a direct child by value.
									   *
									   * @param value character value to look for
									   * @return the matching child, or {@code null} if there is none
									   */
									  public wordnode querysub(final int value) {
									    if (subnodes == null) {
									      return null;
									    }
									    for (wordnode subnode : subnodes) {
									      if (subnode.value == value) {
									        return subnode;
									      }
									    }
									    return null;
									  }

									  /**
									   * @return {@code true} when a sensitive word ends at this node
									   */
									  public boolean islast() {
									    return islast;
									  }

									  /**
									   * @param islast whether a sensitive word ends at this node
									   */
									  public void setlast(boolean islast) {
									    this.islast = islast;
									  }

									  /**
									   * @return the node's character value
									   */
									  @Override
									  public int hashCode() {
									    return value;
									  }
									}