import re

from zhon import hanzi

from modules.models import get_tokenizer
from modules.utils.detect_lang import guess_lang


# Parse text and split it into sentences at stop punctuation.
# A maximum threshold can be set: if a split fragment is shorter than the
# threshold, it is merged with the next fragment.
class SentenceSplitter:
    SEP_TOKEN = " "

    def __init__(self, threshold=100):
        assert (
            isinstance(threshold, int) and threshold > 0
        ), "Threshold must be a positive integer."
        self.sentence_threshold = threshold
        self.tokenizer = get_tokenizer()

    def count_tokens(self, text: str):
        return len(self.tokenizer.tokenize(text))

    def parse(self, text: str):
        sentences = self.split_paragraph(text)
        sentences = self.merge_text_by_threshold(sentences)
        return sentences

    def merge_text_by_threshold(self, sentences: list[str]):
        """
        Merge sentences by threshold. While the accumulated text is shorter
        than the threshold, keep appending the next sentence to it.
        """
        merged_sentences: list[str] = []
        temp_sentence = ""
        for sentence in sentences:
            if len(temp_sentence) + len(sentence) < self.sentence_threshold:
                # Avoid a leading separator on the first fragment.
                if temp_sentence:
                    temp_sentence += SentenceSplitter.SEP_TOKEN
                temp_sentence += sentence
            else:
                # Skip the empty initial accumulator so no "" chunk is emitted.
                if temp_sentence:
                    merged_sentences.append(temp_sentence)
                temp_sentence = sentence
        if temp_sentence:
            merged_sentences.append(temp_sentence)
        return merged_sentences

    def split_paragraph(self, text: str):
        """
        Split text into sentences, line by line, routing each line to the
        English or Chinese splitter based on detected language.
        """
        lines = text.split("\n")
        sentences: list[str] = []
        for line in lines:
            if self.is_eng_sentence(line):
                sentences.extend(self.split_en_sentence(line))
            else:
                sentences.extend(self.split_zhon_sentence(line))
        return sentences

    def is_eng_sentence(self, text: str):
        return guess_lang(text) == "en"

    def split_en_sentence(self, text: str):
        """
        Split English text into sentences.
        """
        # NOTE: the original pattern was truncated in the source; splitting on
        # whitespace that follows sentence-ending punctuation is a
        # reconstruction of the likely intent.
        pattern = re.compile(r"(?<=[.!?])\s+")
        sentences = pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]

    def split_zhon_sentence(self, text: str):
        """
        Split Chinese text into sentences.
        """
        # NOTE: this method body was missing from the truncated source; using
        # zhon's hanzi.sentence pattern is the conventional reconstruction,
        # which also explains the zhon import above.
        sentences = re.findall(hanzi.sentence, text)
        return [s.strip() for s in sentences if s.strip()]
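
# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal example of driving SentenceSplitter end to end, assuming
# modules.models.get_tokenizer and modules.utils.detect_lang.guess_lang
# resolve inside this project. The sample text mixes English and Chinese so
# both split paths are exercised; the threshold of 50 is arbitrary.
if __name__ == "__main__":
    splitter = SentenceSplitter(threshold=50)
    sample = "First sentence. Second one!\n这是第一句。这是第二句!"
    for chunk in splitter.parse(sample):
        print(splitter.count_tokens(chunk), repr(chunk))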