import { cutForSearch } from "@node-rs/jieba";
import Colors from "colors";
import minisearch from "minisearch";
import sizeof from "object-sizeof";
import { getPostFileContent, sortedPosts } from "./post-process";

/**
 * Matches every character that is NOT one of the characters the tokenizer
 * should see: CJK ideographs, Japanese kana, Hangul, Latin letters (basic,
 * Latin-1 and Latin Extended-A/B) and whitespace.
 *
 * Everything matched by this regex (punctuation, symbols and also ASCII
 * digits, which are outside the listed ranges) is replaced with a space
 * before segmentation. Because of flaws in the word tokenizer, only these
 * "CJKL" characters may reach it; otherwise terms get recognized repeatedly.
 */
const NonCJKLRecognizeRegex =
  /[^\u4e00-\u9fa5\u3040-\u30ff\uac00-\ud7af\u1100-\u11ff\u3130-\u318f\u31c0-\u31ef\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u0041-\u005a\u0061-\u007a\u00c0-\u00ff\u0100-\u017f\u0180-\u024f\s ]/g;

/**
 * Splits text into search terms with jieba's search-mode segmentation.
 *
 * Characters matched by {@link NonCJKLRecognizeRegex} are blanked out first,
 * and tokens that consist only of whitespace are dropped from the result.
 *
 * @param str - Raw text to tokenize (a document field or a search query).
 * @returns The non-blank tokens, in the order jieba produced them.
 */
function tokenizer(str: string): string[] {
  // NOTE(review): the second argument is presumably jieba's HMM switch —
  // confirm against the @node-rs/jieba API.
  const segments = cutForSearch(str.replace(NonCJKLRecognizeRegex, " "), true);
  // A single filter pass replaces the former splice-inside-loop removal,
  // which shifted the array on every deletion.
  return segments.filter((token) => token.trim() !== "");
}

/**
 * Builds the MiniSearch index over every post in `sortedPosts.allPostList`.
 *
 * Each post is indexed by its id, front-matter title, tags, subtitle and
 * summary plus the content returned by `getPostFileContent`. Only id, title,
 * tags and summary are stored for retrieval with search results. The index
 * size and the build duration are logged to the console when done.
 *
 * @returns The populated MiniSearch instance.
 */
function makeSearchIndex() {
  const startTime = Date.now();

  const miniSearch = new minisearch({
    fields: ["id", "title", "tags", "subtitle", "summary", "content"],
    storeFields: ["id", "title", "tags", "summary"],
    tokenize: tokenizer,
  });

  for (const post of sortedPosts.allPostList) {
    const content = getPostFileContent(post.id);
    miniSearch.add({
      id: post.id,
      title: post.frontMatter.title,
      tags: post.frontMatter.tags,
      subtitle: post.frontMatter.subtitle,
      summary: post.frontMatter.summary,
      content,
    });
  }

  const endTime = Date.now();
  // object-sizeof result divided by 1024^2, shown with three decimals.
  const sizeofIndex = (sizeof(miniSearch) / 1024 ** 2).toFixed(3);
  const elapsedSeconds = (endTime - startTime) / 1000;

  console.log(
    Colors.cyan(
      `Search index is ready. And the size of index is ${sizeofIndex} mb. And it costs ${elapsedSeconds} s.`,
    ),
  );

  return miniSearch;
}

/**
 * The search index, built once when this module is first imported.
 *
 * `Object.freeze` is shallow: it only prevents adding, removing or
 * reassigning the instance's own properties.
 */
export const SearchIndex = Object.freeze(makeSearchIndex());