|
|
@@ -0,0 +1,255 @@
|
|
|
+package com.fs.sensitive.manager;
|
|
|
+
|
|
|
+import com.fs.sensitive.service.ICompanyAiSensitiveWordService;
|
|
|
+import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.springframework.beans.factory.annotation.Autowired;
|
|
|
+import org.springframework.stereotype.Component;
|
|
|
+
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.Collections;
|
|
|
+import java.util.List;
|
|
|
+import java.util.TreeMap;
|
|
|
+import java.util.concurrent.ConcurrentHashMap;
|
|
|
+
|
|
|
+/**
|
|
|
+ * 敏感词 AC 自动机管理器(基于 com.hankcs:aho-corasick-double-array-trie)。
|
|
|
+ *
|
|
|
+ * <p>设计要点:
|
|
|
+ * <ul>
|
|
|
+ * <li>按 {@code tenantId} 维度缓存 {@link AhoCorasickDoubleArrayTrie},每个租户库一棵树。</li>
|
|
|
+ * <li>调用前请确保 ThreadLocal 数据源已切到对应租户库(动态数据源切面会自动生效)。</li>
|
|
|
+ * <li>{@link #evict(Long)} 用于后台 CRUD 时显式清空缓存,下次读取触发懒加载重建。</li>
|
|
|
+ * <li>{@link AhoCorasickDoubleArrayTrie} 一旦 build 完成不可增量修改,新增/删除敏感词必须重建整棵树。</li>
|
|
|
+ * </ul>
|
|
|
+ *
|
|
|
+ * <p>典型用法:
|
|
|
+ * <pre>
|
|
|
+ * if (sensitiveWordAcManager.hasSensitiveWord(tenantId, text)) {
|
|
|
+ * String highlighted = sensitiveWordAcManager.highlight(
|
|
|
+ * tenantId, text, "<span class=\"sensitive\">", "</span>");
|
|
|
+ * }
|
|
|
+ * </pre>
|
|
|
+ *
|
|
|
+ * @author fs
|
|
|
+ */
|
|
|
+@Slf4j
|
|
|
+@Component
|
|
|
+public class SensitiveWordAcManager {
|
|
|
+
|
|
|
+ /** 空树占位符,避免不停地针对空词库重复 build */
|
|
|
+ private static final AhoCorasickDoubleArrayTrie<String> EMPTY_TRIE = buildTrie(Collections.emptyList());
|
|
|
+
|
|
|
+ /** tenantId -> 该租户库的敏感词 AC 自动机 */
|
|
|
+ private final ConcurrentHashMap<Long, AhoCorasickDoubleArrayTrie<String>> cache = new ConcurrentHashMap<>();
|
|
|
+
|
|
|
+ @Autowired
|
|
|
+ private ICompanyAiSensitiveWordService companyAiSensitiveWordService;
|
|
|
+
|
|
|
+ // ---------------------------------------------------------
|
|
|
+ // 构建 / 缓存
|
|
|
+ // ---------------------------------------------------------
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 获取指定租户的 AC 自动机,不存在时懒加载构建。
|
|
|
+ *
|
|
|
+ * @param tenantId 租户 ID(不能为 null)
|
|
|
+ * @return 该租户的 AC 自动机
|
|
|
+ */
|
|
|
+ public AhoCorasickDoubleArrayTrie<String> get(Long tenantId) {
|
|
|
+ if (tenantId == null) {
|
|
|
+ return EMPTY_TRIE;
|
|
|
+ }
|
|
|
+ AhoCorasickDoubleArrayTrie<String> trie = cache.get(tenantId);
|
|
|
+ if (trie != null) {
|
|
|
+ return trie;
|
|
|
+ }
|
|
|
+ synchronized (cache) {
|
|
|
+ trie = cache.get(tenantId);
|
|
|
+ if (trie != null) {
|
|
|
+ return trie;
|
|
|
+ }
|
|
|
+ trie = loadFromDb(tenantId);
|
|
|
+ cache.put(tenantId, trie);
|
|
|
+ return trie;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 从数据库加载启用中的敏感词并构建 AC 自动机。
|
|
|
+ * 调用此方法前 ThreadLocal 数据源必须已切到对应租户库。
|
|
|
+ */
|
|
|
+ private AhoCorasickDoubleArrayTrie<String> loadFromDb(Long tenantId) {
|
|
|
+ try {
|
|
|
+ List<String> words = companyAiSensitiveWordService.selectAllEnabledWords();
|
|
|
+ if (words == null || words.isEmpty()) {
|
|
|
+ log.info("【敏感词缓存】tenantId={} 启用的敏感词为空,使用空树", tenantId);
|
|
|
+ return EMPTY_TRIE;
|
|
|
+ }
|
|
|
+ AhoCorasickDoubleArrayTrie<String> trie = buildTrie(words);
|
|
|
+ log.info("【敏感词缓存】tenantId={} 构建完成, 词条数={}", tenantId, words.size());
|
|
|
+ return trie;
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("【敏感词缓存】tenantId={} 加载失败,本次返回空树", tenantId, e);
|
|
|
+ return EMPTY_TRIE;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /** 把词表构建成一棵不可变的双数组 Trie,value 直接复用敏感词本身。 */
|
|
|
+ private static AhoCorasickDoubleArrayTrie<String> buildTrie(List<String> words) {
|
|
|
+ TreeMap<String, String> map = new TreeMap<>();
|
|
|
+ if (words != null) {
|
|
|
+ for (String w : words) {
|
|
|
+ if (StringUtils.isNotBlank(w)) {
|
|
|
+ map.put(w, w);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ AhoCorasickDoubleArrayTrie<String> trie = new AhoCorasickDoubleArrayTrie<>();
|
|
|
+ trie.build(map);
|
|
|
+ return trie;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 清除指定租户的缓存(后台增删改敏感词后调用,下次读取会重建)。
|
|
|
+ */
|
|
|
+ public void evict(Long tenantId) {
|
|
|
+ if (tenantId != null) {
|
|
|
+ cache.remove(tenantId);
|
|
|
+ log.info("【敏感词缓存】tenantId={} 缓存已清除", tenantId);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /** 清除全部租户缓存。 */
|
|
|
+ public void evictAll() {
|
|
|
+ cache.clear();
|
|
|
+ log.info("【敏感词缓存】全部租户缓存已清除");
|
|
|
+ }
|
|
|
+
|
|
|
+ // ---------------------------------------------------------
|
|
|
+ // 检测 / 标红 / 替换
|
|
|
+ // ---------------------------------------------------------
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 是否包含敏感词。出现任意一个即返回 true。
|
|
|
+ */
|
|
|
+ public boolean hasSensitiveWord(Long tenantId, String text) {
|
|
|
+ if (StringUtils.isBlank(text)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ AhoCorasickDoubleArrayTrie<String> trie = get(tenantId);
|
|
|
+ if (trie == EMPTY_TRIE) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ // parseText 找到第一个就抛异常以快速短路
|
|
|
+ final boolean[] found = new boolean[]{false};
|
|
|
+ AhoCorasickDoubleArrayTrie.IHit<String> hit = new AhoCorasickDoubleArrayTrie.IHit<String>() {
|
|
|
+ @Override
|
|
|
+ public void hit(int begin, int end, String value) {
|
|
|
+ found[0] = true;
|
|
|
+ throw new ShortCircuitException();
|
|
|
+ }
|
|
|
+ };
|
|
|
+ try {
|
|
|
+ trie.parseText(text, hit);
|
|
|
+ } catch (ShortCircuitException ignored) {
|
|
|
+ // 短路退出,无须额外处理
|
|
|
+ }
|
|
|
+ return found[0];
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 列出所有命中(包含位置区间,可用于前端标红/统计)。
|
|
|
+ */
|
|
|
+ public List<SensitiveWordHit> matchAll(Long tenantId, String text) {
|
|
|
+ if (StringUtils.isBlank(text)) {
|
|
|
+ return Collections.emptyList();
|
|
|
+ }
|
|
|
+ AhoCorasickDoubleArrayTrie<String> trie = get(tenantId);
|
|
|
+ if (trie == EMPTY_TRIE) {
|
|
|
+ return Collections.emptyList();
|
|
|
+ }
|
|
|
+ final List<SensitiveWordHit> hits = new ArrayList<>();
|
|
|
+ AhoCorasickDoubleArrayTrie.IHit<String> collector = new AhoCorasickDoubleArrayTrie.IHit<String>() {
|
|
|
+ @Override
|
|
|
+ public void hit(int begin, int end, String value) {
|
|
|
+ hits.add(new SensitiveWordHit(begin, end, value));
|
|
|
+ }
|
|
|
+ };
|
|
|
+ trie.parseText(text, collector);
|
|
|
+ return hits;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 返回标记后的文本,命中部分被 {@code prefixTag} / {@code suffixTag} 包裹。
|
|
|
+ * 例如 {@code highlight(tenantId, text, "<span class=\"sensitive\">", "</span>")}。
|
|
|
+ *
|
|
|
+ * <p>处理重叠命中:保留较早出现且较长的命中,丢弃被覆盖的命中,避免标签嵌套混乱。
|
|
|
+ */
|
|
|
+ public String highlight(Long tenantId, String text, String prefixTag, String suffixTag) {
|
|
|
+ if (StringUtils.isBlank(text)) {
|
|
|
+ return text;
|
|
|
+ }
|
|
|
+ List<SensitiveWordHit> hits = matchAll(tenantId, text);
|
|
|
+ if (hits.isEmpty()) {
|
|
|
+ return text;
|
|
|
+ }
|
|
|
+ // 按 begin 升序、长度降序排序后做区间合并
|
|
|
+ hits.sort((a, b) -> {
|
|
|
+ if (a.getBegin() != b.getBegin()) {
|
|
|
+ return Integer.compare(a.getBegin(), b.getBegin());
|
|
|
+ }
|
|
|
+ return Integer.compare(b.getEnd() - b.getBegin(), a.getEnd() - a.getBegin());
|
|
|
+ });
|
|
|
+
|
|
|
+ StringBuilder sb = new StringBuilder(text.length() + hits.size() * 16);
|
|
|
+ int cursor = 0;
|
|
|
+ for (SensitiveWordHit hit : hits) {
|
|
|
+ if (hit.getBegin() < cursor) {
|
|
|
+ // 与前一段重叠,跳过
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (hit.getBegin() > cursor) {
|
|
|
+ sb.append(text, cursor, hit.getBegin());
|
|
|
+ }
|
|
|
+ sb.append(prefixTag)
|
|
|
+ .append(text, hit.getBegin(), hit.getEnd())
|
|
|
+ .append(suffixTag);
|
|
|
+ cursor = hit.getEnd();
|
|
|
+ }
|
|
|
+ if (cursor < text.length()) {
|
|
|
+ sb.append(text, cursor, text.length());
|
|
|
+ }
|
|
|
+ return sb.toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 把命中部分用 {@code mask} 字符按命中长度等长替换(如 '*')。
|
|
|
+ */
|
|
|
+ public String replace(Long tenantId, String text, char mask) {
|
|
|
+ if (StringUtils.isBlank(text)) {
|
|
|
+ return text;
|
|
|
+ }
|
|
|
+ List<SensitiveWordHit> hits = matchAll(tenantId, text);
|
|
|
+ if (hits.isEmpty()) {
|
|
|
+ return text;
|
|
|
+ }
|
|
|
+ char[] chars = text.toCharArray();
|
|
|
+ for (SensitiveWordHit hit : hits) {
|
|
|
+ for (int i = hit.getBegin(); i < hit.getEnd() && i < chars.length; i++) {
|
|
|
+ chars[i] = mask;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return new String(chars);
|
|
|
+ }
|
|
|
+
|
|
|
+ /** 仅用于 parseText 短路退出的内部异常 */
|
|
|
+ private static final class ShortCircuitException extends RuntimeException {
|
|
|
+ private static final long serialVersionUID = 1L;
|
|
|
+
|
|
|
+ ShortCircuitException() {
|
|
|
+ super(null, null, false, false);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|