本文實例講述了Java使用DFA算法實現過濾多家公司自定義敏感字功能。分享給大家供大家參考,具體如下:
背景
因為最近通訊業務有個需求,需要讓多家客戶公司可以各自定義敏感詞,並按他們自定義的規則進行過濾,所以選擇了DFA算法來做,不過和以前傳統的DFA寫法不太一樣。
模式圖
直接上代碼
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
|
/**
 * Sensitive-word filter built on a DFA (character trie) that is shared by several companies.
 *
 * <p>All companies register their words in one trie rooted at {@link #currentmap}. Every trie
 * node is a map whose keys are either a one-character string (edge to a child node) or a company
 * key of the form {@code "companyid" + id}. The value stored under a company key is {@code "1"}
 * when a word of that company ends at the node and {@code "0"} when the node is only an inner
 * step of one of its words.
 *
 * <p>Public member names are kept exactly as published so existing callers keep compiling.
 * Required imports: {@code java.util.ArrayList}, {@code java.util.HashMap},
 * {@code java.util.List}, {@code java.util.Map},
 * {@code java.util.concurrent.ConcurrentHashMap}.
 *
 * <p>Thread-safety: lookups keep all traversal state in local variables. Only the root map is a
 * {@code ConcurrentHashMap}; inner nodes are plain {@code HashMap}s, so {@link #savekeywords}
 * and {@link #clear} are not synchronized against concurrent readers or writers.
 */
public class keywordfilter {

    /** Root of the shared trie: one-character string -> child node. */
    @SuppressWarnings("rawtypes") // raw HashMap is part of the published field type
    public static Map<String, HashMap> currentmap = new ConcurrentHashMap<String, HashMap>();

    /**
     * Former traversal scratch field. It is no longer written (shared scratch state made
     * concurrent calls corrupt each other) and therefore always stays {@code null}, which is
     * also the value the previous implementation left behind after every call.
     */
    @SuppressWarnings("rawtypes")
    public static Map nowhash = null;

    /** Former traversal scratch field; kept only for source compatibility, never written. */
    public static Object wordmap;

    /** Flag value: the node is an inner step of one of the company's words. */
    private static final String PREFIX_FLAG = "0";

    /** Flag value: one of the company's words ends at the node. */
    private static final String WORD_FLAG = "1";

    /** Utility class, not instantiable. */
    private keywordfilter() {
    }

    /** Builds the per-company flag key stored inside trie nodes. */
    private static String getkey(int companyid) {
        return "companyid" + companyid;
    }

    /** Removes every registered word of every company. */
    public static void clear() {
        currentmap.clear();
    }

    /**
     * Registers the sensitive words of one company.
     *
     * <p>Each word is trimmed first; {@code null} and blank entries are skipped. Words that were
     * registered earlier (by this or another company) are kept.
     *
     * @param companyid company the words belong to
     * @param keywords  words to register; {@code null} is treated as an empty list
     */
    public static void savekeywords(int companyid, List<String> keywords) {
        if (keywords == null) {
            return;
        }
        String key = getkey(companyid);
        for (String raw : keywords) {
            if (raw == null) {
                continue;
            }
            String keyword = raw.trim();
            if (keyword.length() == 0) {
                continue;
            }
            Map<String, Object> node = asNode(currentmap);
            for (int j = 0; j < keyword.length(); j++) {
                String edge = String.valueOf(keyword.charAt(j));
                Map<String, Object> child = asNode(node.get(edge));
                if (child == null) {
                    child = new HashMap<String, Object>();
                    node.put(edge, child);
                }
                // Mark the child itself (not its parent) as part of this company's path,
                // without downgrading an existing "1" of a shorter word.
                if (!child.containsKey(key)) {
                    child.put(key, PREFIX_FLAG);
                }
                node = child;
            }
            node.put(key, WORD_FLAG);
        }
    }

    /**
     * Replaces every sensitive word of the given company in {@code txt} with a single
     * {@code "*"}.
     *
     * <p>The text is scanned left to right. At each position the longest word of this company
     * wins; if a longer candidate does not complete, the shorter word that did match is still
     * replaced. Words are matched literally (no regular-expression interpretation) and words of
     * other companies are ignored.
     *
     * @param companyid company whose word list is applied
     * @param txt       text to filter; {@code null} is treated as an empty text
     * @return a two-element list: index 0 is the filtered text, index 1 the distinct words that
     *         were found, comma separated in order of first appearance (empty string if none)
     */
    public static List<String> repword(int companyid, String txt) {
        String text = txt == null ? "" : txt;
        String key = getkey(companyid);
        StringBuilder filtered = new StringBuilder(text.length());
        List<String> found = new ArrayList<String>();
        int i = 0;
        while (i < text.length()) {
            int len = matchLength(text, key, i, true);
            if (len > 0) {
                remember(found, text.substring(i, i + len));
                filtered.append('*');
                i += len;
            } else {
                filtered.append(text.charAt(i));
                i++;
            }
        }
        List<String> result = new ArrayList<String>();
        result.add(filtered.toString());
        result.add(join(found));
        return result;
    }

    /**
     * Returns the length of the shortest sensitive word of the company that starts exactly at
     * {@code begin}, or {@code 0} if no word starts there.
     *
     * <p>Deliberately private: it is the building block of {@link #gettxtkeywords} and
     * {@link #iskeywords}.
     */
    private static int checkkeywords(String txt, int companyid, int begin) {
        return matchLength(txt, getkey(companyid), begin, false);
    }

    /**
     * Lists the sensitive words of the company that occur in {@code txt}.
     *
     * <p>The text is scanned left to right without being modified; at each position the
     * shortest matching word is taken and the scan continues behind it.
     *
     * @param txt       text to inspect
     * @param companyid company whose word list is applied
     * @return the distinct words found, comma separated in order of first appearance, or
     *         {@code null} when none was found or {@code txt} is {@code null}
     */
    public static String gettxtkeywords(String txt, int companyid) {
        if (txt == null) {
            return null;
        }
        List<String> found = new ArrayList<String>();
        int i = 0;
        while (i < txt.length()) {
            int len = checkkeywords(txt, companyid, i);
            if (len > 0) {
                remember(found, txt.substring(i, i + len));
                i += len;
            } else {
                i++;
            }
        }
        return found.isEmpty() ? null : join(found);
    }

    /**
     * Tells whether {@code txt} contains at least one sensitive word of the company.
     *
     * <p>Static because the class cannot be instantiated.
     *
     * @param txt       text to inspect; {@code null} yields {@code false}
     * @param companyid company whose word list is applied
     * @return {@code true} if a sensitive word occurs anywhere in the text
     */
    public static boolean iskeywords(String txt, int companyid) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkkeywords(txt, companyid, i) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Walks the trie from {@code begin} and returns the length of the word found there.
     *
     * @param key     company flag key, see {@link #getkey}
     * @param longest {@code true} to keep walking for the longest word, {@code false} to stop
     *                at the first (shortest) one
     * @return matched length, {@code 0} if no word of the company starts at {@code begin}
     */
    private static int matchLength(String txt, String key, int begin, boolean longest) {
        Map<String, Object> node = asNode(currentmap);
        int matched = 0;
        for (int i = begin; i < txt.length(); i++) {
            node = asNode(node.get(String.valueOf(txt.charAt(i))));
            // Every node on a company's word carries its key, so a missing key means no word
            // of this company continues along this path.
            if (node == null || !node.containsKey(key)) {
                break;
            }
            if (WORD_FLAG.equals(node.get(key))) {
                matched = i - begin + 1;
                if (!longest) {
                    break;
                }
            }
        }
        return matched;
    }

    /** Views a trie node (or the root map) as a string-keyed map; {@code null} stays null. */
    @SuppressWarnings("unchecked") // nodes are only ever created by savekeywords
    private static Map<String, Object> asNode(Object node) {
        return (Map<String, Object>) node;
    }

    /** Appends {@code keyword} unless it is already listed. */
    private static void remember(List<String> found, String keyword) {
        if (!found.contains(keyword)) {
            found.add(keyword);
        }
    }

    /** Joins the words with commas, without a trailing separator. */
    private static String join(List<String> words) {
        StringBuilder joined = new StringBuilder();
        for (String word : words) {
            if (joined.length() > 0) {
                joined.append(',');
            }
            joined.append(word);
        }
        return joined.toString();
    }

    /** Small demo: registers four words for company 1 and filters a sample text. */
    public static void main(String[] arg) {
        List<String> keywords = new ArrayList<String>();
        keywords.add("傻×");
        keywords.add("漢奸");
        keywords.add("草");
        keywords.add("草泥馬");
        keywordfilter.savekeywords(1, keywords);
        String txt = "是傻×漢奸傻a傻b傻c傻d漢奸傻×草泥馬";
        List<String> list = repword(1, txt);
        System.out.println("文中包含的敏感字為:" + list.get(1));
        System.out.println("原文:" + txt);
        System.out.println("敏感字過濾后:" + list.get(0));
    }
}
希望本文所述對大家Java程序設計有所幫助。
原文鏈接:https://my.oschina.net/grkj/blog/1522696