|
|
@@ -35,8 +35,17 @@ import java.util.Map;
|
|
|
@Service
|
|
|
public class FastgptChatQuestionCollectServiceImpl {
|
|
|
|
|
|
- private static final int SIMHASH_THRESHOLD = 14;
|
|
|
- private static final double JACCARD_THRESHOLD = 0.55d;
|
|
|
+ /**
|
|
|
+ * 本地匹配阈值(越大越松:允许更大的 SimHash 汉明距离)
|
|
|
+ */
|
|
|
+ private static final int SIMHASH_THRESHOLD = 30;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 本地 Jaccard 阈值(越小越松)
|
|
|
+ */
|
|
|
+ private static final double JACCARD_THRESHOLD = 0.45d;
|
|
|
+
|
|
|
+ private static final int LOCAL_CANDIDATE_LIMIT = 30;
|
|
|
|
|
|
private static final String MODEL_TYPE_HIGH_FREQ = "高频问题类别";
|
|
|
|
|
|
@@ -77,31 +86,22 @@ public class FastgptChatQuestionCollectServiceImpl {
|
|
|
Date now = DateUtils.getNowDate();
|
|
|
long sh = FastgptQuestionNormalizeUtil.simhash64(display);
|
|
|
|
|
|
- // 本地匹配已有统计,获取数据id
|
|
|
- Long statId = tryMergeByLocalTextMatch(display, sh, now);
|
|
|
+ // ① 先本地匹配:命中则频次+1,并且才会继续走 AI;未命中则不走 AI
|
|
|
+ LocalMatch localMatch = findBestLocalMatch(display, sh);
|
|
|
+ Long statId = null;
|
|
|
+ if (localMatch != null) {
|
|
|
+ statId = localMatch.statId;
|
|
|
+ fastgptChatQuestionStatisticsMapper.incrementFrequencyById(statId, now);
|
|
|
|
|
|
- if (statId == null) {
|
|
|
- // 如果没有匹配,就调用 AI
|
|
|
+ // ② 只有本地命中时,才调用 AI 做类别归类(用于给统计行补齐/修正 question_category)
|
|
|
Integer aiCategory = getHighFreqCategoryByAi(param);
|
|
|
if (aiCategory != null) {
|
|
|
- FastgptChatQuestionStatistics questionStatistics =
|
|
|
- fastgptChatQuestionStatisticsMapper.selectFirstByQuestionCategory(aiCategory);
|
|
|
- if (questionStatistics != null && questionStatistics.getId() != null) {
|
|
|
- statId = questionStatistics.getId();
|
|
|
- fastgptChatQuestionStatisticsMapper.incrementFrequencyById(statId, now);
|
|
|
- } else {
|
|
|
- FastgptChatQuestionStatistics row = new FastgptChatQuestionStatistics();
|
|
|
- row.setQuestionCategory(aiCategory);
|
|
|
- row.setContentSummary(display.length() > 200 ? display.substring(0, 200) : display);
|
|
|
- row.setSimhash(sh);
|
|
|
- row.setIsResolve(0);
|
|
|
- row.setQuestionId(detailId);
|
|
|
- row.setFrequency(1);
|
|
|
- row.setCreateTime(now);
|
|
|
- row.setUpdateTime(now);
|
|
|
- fastgptChatQuestionStatisticsMapper.insertFastgptChatQuestionStatistics(row);
|
|
|
- statId = row.getId();
|
|
|
- }
|
|
|
+ FastgptChatQuestionStatistics upd = new FastgptChatQuestionStatistics();
|
|
|
+ upd.setId(statId);
|
|
|
+ upd.setQuestionCategory(aiCategory);
|
|
|
+ upd.setUpdateTime(now);
|
|
|
+ // 只更新类别(不覆盖其他字段)
|
|
|
+ fastgptChatQuestionStatisticsMapper.updateFastgptChatQuestionStatistics(upd);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -207,29 +207,45 @@ public class FastgptChatQuestionCollectServiceImpl {
|
|
|
/**
|
|
|
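* Fetches candidate statistics rows by SimHash distance, scores each against {@code display} with Jaccard similarity, and returns the best match at or above the Jaccard threshold (ties go to the larger id), or null if none qualifies; it neither increments frequency nor inserts rows — callers perform the increment.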
|
|
|
*/
|
|
|
- private Long tryMergeByLocalTextMatch(String display, long sh, Date now) {
|
|
|
- FastgptChatQuestionStatistics best = fastgptChatQuestionStatisticsMapper.selectBestMatchBySimhash(sh, SIMHASH_THRESHOLD);
|
|
|
- if (best != null && best.getId() != null) {
|
|
|
- double jac = FastgptQuestionNormalizeUtil.jaccard(
|
|
|
- FastgptQuestionNormalizeUtil.ngramTokens(display),
|
|
|
- FastgptQuestionNormalizeUtil.ngramTokens(best.getContentSummary())
|
|
|
- );
|
|
|
+ private LocalMatch findBestLocalMatch(String display, long sh) {
|
|
|
+ // 取一批候选,Java 侧算 Jaccard 再挑最优
|
|
|
+ java.util.List<FastgptChatQuestionStatistics> candidates =
|
|
|
+ fastgptChatQuestionStatisticsMapper.selectCandidatesBySimhash(sh, SIMHASH_THRESHOLD, LOCAL_CANDIDATE_LIMIT);
|
|
|
+ if (candidates == null || candidates.isEmpty()) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ java.util.Set<String> a = FastgptQuestionNormalizeUtil.ngramTokens(display);
|
|
|
+ LocalMatch best = null;
|
|
|
+ for (FastgptChatQuestionStatistics c : candidates) {
|
|
|
+ if (c == null || c.getId() == null || StringUtils.isBlank(c.getContentSummary())) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ double jac = FastgptQuestionNormalizeUtil.jaccard(a, FastgptQuestionNormalizeUtil.ngramTokens(c.getContentSummary()));
|
|
|
if (jac < JACCARD_THRESHOLD) {
|
|
|
- best = null;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // dist 列未映射到实体,这里拿不到;所以只用 jac + id 做选择
|
|
|
+ if (best == null) {
|
|
|
+ best = new LocalMatch(c.getId(), jac);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // 选择“匹配值更大”的(这里用 Jaccard 作为匹配分数)
|
|
|
+ if (jac > best.jaccard) {
|
|
|
+ best = new LocalMatch(c.getId(), jac);
|
|
|
+ } else if (Double.compare(jac, best.jaccard) == 0 && c.getId() > best.statId) {
|
|
|
+ // 分数相同取最新(id 更大)
|
|
|
+ best = new LocalMatch(c.getId(), jac);
|
|
|
}
|
|
|
}
|
|
|
- if (best != null && best.getId() != null) {
|
|
|
- fastgptChatQuestionStatisticsMapper.incrementFrequencyById(best.getId(), now);
|
|
|
- return best.getId();
|
|
|
- }
|
|
|
- return null;
|
|
|
+ return best;
|
|
|
}
|
|
|
|
|
|
/** AI 也未归类时:再按 SimHash 找一遍(与本地逻辑一致),仍无则插入 question_category=0 的新统计行 */
|
|
|
private Long mergeBySimhashFallback(String display, long sh, Long detailId, Date now) {
|
|
|
- Long localId = tryMergeByLocalTextMatch(display, sh, now);
|
|
|
- if (localId != null) {
|
|
|
- return localId;
|
|
|
+ LocalMatch localMatch = findBestLocalMatch(display, sh);
|
|
|
+ if (localMatch != null) {
|
|
|
+ fastgptChatQuestionStatisticsMapper.incrementFrequencyById(localMatch.statId, now);
|
|
|
+ return localMatch.statId;
|
|
|
}
|
|
|
FastgptChatQuestionStatistics row = new FastgptChatQuestionStatistics();
|
|
|
row.setQuestionCategory(0);
|
|
|
@@ -249,6 +265,16 @@ public class FastgptChatQuestionCollectServiceImpl {
|
|
|
return statId;
|
|
|
}
|
|
|
|
|
|
+ private static class LocalMatch {
|
|
|
+ private final Long statId;
|
|
|
+ private final double jaccard;
|
|
|
+
|
|
|
+ private LocalMatch(Long statId, double jaccard) {
|
|
|
+ this.statId = statId;
|
|
|
+ this.jaccard = jaccard;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
private static FastgptChatQuestion buildQuestion(FastgptKnowledgeMissCollectParam param, String userContent) {
|
|
|
FastgptChatQuestion q = new FastgptChatQuestion();
|
|
|
BeanCopyUtils.copy(param, q);
|