欢迎光临散文网 会员登录 & 注册

使用KNN及TF进行中文PDF搜索,类似于AutoGPT或ChatPDF实现原理!

2023-05-05 10:26 作者:civilpy  | 我要投稿

直接上代码,可将该功能与GPT结合使用:

PDF文本搜索

import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors


def preprocess(text):
    """Collapse every run of whitespace (newlines included) in *text* to one space."""
    # A raw string avoids the invalid-escape warning that '\s' triggers.
    return re.sub(r'\s+', ' ', text)


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the preprocessed text of each page of a PDF.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive);
            ``None`` means the document's last page.

    Returns:
        A list with one preprocessed string per page, in page order.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count
        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, end_page)
        ]
    finally:
        # Release the document even if a page fails to load.
        doc.close()


def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into page-labelled chunks of space-separated tokens.

    Args:
        texts: One string per page, as returned by ``pdf_to_text``.
        word_length: Maximum number of space-separated tokens per chunk.
        start_page: Page number that ``texts[0]`` corresponds to; used for
            the ``[Page no. N]`` label.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    # NOTE(review): tokens are produced by splitting on single spaces, so text
    # with few spaces (e.g. Chinese) yields few, long tokens - confirm that
    # this chunk granularity is what is wanted.
    text_toks = [t.split(' ') for t in texts]
    chunks = []
    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            is_short_tail = (i + word_length) > len(words) and len(chunk) < word_length
            is_last_page = len(text_toks) == idx + 1
            if is_short_tail and not is_last_page:
                # Carry an undersized trailing chunk over to the next page so
                # it is merged with (and labelled as) that page's text.
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks


class SemanticSearch:
    """Nearest-neighbour search over text chunks using a TF-Hub embedding model."""

    def __init__(self):
        # Chinese-language model according to the original author's note;
        # see https://www.intumu.com/article/203
        self.use = hub.load("F:/*******")
        self.fitted = False

    def fit(self, data, batch=100, n_neighbors=3):
        """Embed *data* and build the nearest-neighbour index.

        Args:
            data: List of text chunks to index.
            batch: Number of chunks embedded per model call.
            n_neighbors: Neighbours returned per query; capped at the number
                of embedded chunks.
        """
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the chunks (or their indices) nearest to *text*.

        Args:
            text: Query string.
            return_data: When true return the matching chunks themselves,
                otherwise return their indices into the fitted data.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if not self.fitted:
            raise RuntimeError("SemanticSearch.fit() must be called before querying.")
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in slices of *batch* items and stack the results row-wise."""
        embeddings = [
            self.use(texts[i : (i + batch)])
            for i in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)


def load_recommender(path, start_page=1):
    """Read the PDF at *path*, chunk it, and fit the module-level ``recommender``.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'


# Build the corpus index from the PDF; see https://www.intumu.com/article/203
pdf_path='第3章  岩土工程勘察.pdf'
recommender = SemanticSearch()
load_recommender(pdf_path)

question='钻孔深度相关规定?'
topn_chunks = recommender(question)
print(topn_chunks)

GPT查询代码

def generate_answer(question, openAI_key):
    """Answer *question* from the chunks the module-level ``recommender`` retrieves.

    Args:
        question: The user's query string.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The completion text returned by ``generate_text``.
    """
    topn_chunks = recommender(question)
    search_results = ''.join(c + '\n\n' for c in topn_chunks)
    # The instructions must not contain a "{question}" placeholder: this is a
    # plain string, so the braces would reach the model verbatim. The real
    # query is appended exactly once below.
    instructions = (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. Answer step-by-step. \n\n"
    )
    prompt = (
        'search results:\n\n'
        + search_results
        + instructions
        + f"Query: {question}\nAnswer:"
    )
    return generate_text(openAI_key, prompt, "text-davinci-003")


def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    """Request one completion for *prompt* and return its text.

    Args:
        openAI_key: API key assigned to ``openai.api_key``.
        prompt: Full prompt text.
        engine: Name of the completion engine to use.

    Returns:
        The ``text`` field of the first returned choice.
    """
    # NOTE(review): openai.Completion with ``engine=`` looks like the legacy
    # (pre-1.0) client interface - confirm against the installed openai version.
    openai.api_key = openAI_key
    completions = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return completions.choices[0].text


# NOTE: masked placeholder key from the article; supply a real key at run time
# rather than hard-coding it. See https://www.intumu.com/article/203
openAI_key = 'sk-zo59kJ9gV7yx8xgsn8jrT3BlbkFJT******'
# Print the answer so it is visible when run as a plain script.
print(generate_answer(question, openAI_key))

结语

以上类似于AutoGPT或chatPDF的实现原理,感兴趣的读者可以试试。

civilpy:Python数据分析及可视化实例目录913 赞同 · 36 评论文章


使用KNN及TF进行中文PDF搜索,类似于AutoGPT或ChatPDF实现原理!的评论 (共 条)

分享到微博请遵守国家法律