Improve recognition accuracy of searches

2024-01-08 21:25:34 +08:00
parent 7ad86641ee
commit f91f643209
1 changed file with 7 additions and 4 deletions
@@ -2,8 +2,14 @@ import { cutForSearch } from "@node-rs/jieba";
 import minisearch from "minisearch";
 import { getPostFileContent, sortedPosts } from "./post-process";

+// The word tokenizer is imperfect, so after segmentation we keep only the
+// tokens that contain CJKL characters (CJK ideographs, Japanese kana, Korean
+// Hangul, or Latin letters) to prevent the same text being recognized repeatedly.
+const CJKLRecognizeRegex = /[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7A3a-zA-Z]+/g;
+
 function tokenizer(str: string) {
-  return cutForSearch(str, true);
+  const result = cutForSearch(str, true).filter((item) => item.match(CJKLRecognizeRegex));
+  return result;
 }

 function makeSearchIndex() {
@@ -11,9 +17,6 @@ function makeSearchIndex() {
    fields: ["id", "title", "tags", "subtitle", "summary", "content"],
    storeFields: ["id", "title", "tags"],
    tokenize: tokenizer,
-    searchOptions: {
-      fuzzy: 0.1,
-    },
  });
  for (let index = 0; index < sortedPosts.allPostList.length; index++) {
    const post = sortedPosts.allPostList[index];