This article first preprocesses a news text corpus covering ten categories, then loads the preprocessed documents into Solr through the pysolr interface, and finally implements general retrieval functions on top of Solr. (The Solr service must be configured and started beforehand.)
- dataProcessing.py
This part extracts the title and body text of each news file, using a different regular-expression match for each page layout.
import os
import re

from detect_text_formatting import detect_text_formatting

dataset_path = "./Text classification corpus"

# Placeholder stored when a layout was detected but a field could not be cut
# out of the file.
_NO_MATCH = "No match found"

# Leading run of CJK characters in a category directory name, e.g. the
# directory "交通214" yields the category "交通".
_CATEGORY_RE = re.compile(r'^[\u4e00-\u9fa5]+')

# NOTE(review): the marker strings below are kept verbatim from the published
# listing, which looks machine-translated; confirm them against the raw
# corpus files before relying on the extraction.
_VIEWS_RE = re.compile(r'Number of views: (\d+)')
# NOTE(review): the doubled parentheses may originally have been full-width
# brackets around the reporter credit -- confirm against the corpus.
_DATELINE_RE = re.compile(r'\n(.*)电((.*))')

# Layout detectors, tried in insertion order by detect_text_formatting().
_LAYOUT_PATTERNS = {
    r"【Date】": "pattern1",
    r"Sun and Moon Guanghua --": "pattern2",
    r".*?社.*?\d+日电": "pattern3",
    _VIEWS_RE.pattern: "pattern4",
}


def _category_of(root, dataset_path):
    """Return the category for files found in directory *root*.

    The category is the leading CJK run of the first directory level below
    *dataset_path*; when that name has no CJK prefix the whole directory name
    is used instead of failing.
    """
    top_dir = os.path.relpath(root, dataset_path).split(os.sep)[0]
    if top_dir in ("", os.curdir):
        # File sits directly in the dataset root: fall back to its name.
        top_dir = os.path.basename(os.path.normpath(root))
    match = _CATEGORY_RE.match(top_dir)
    return match.group() if match else top_dir


def _split_on(match, file_content):
    """Split *file_content* around the matched marker into (title, text)."""
    if match is None:
        return _NO_MATCH, _NO_MATCH
    pieces = file_content.split(match.group())
    return pieces[0].strip(), "".join(pieces[1:])


def _extract_fields(layout, file_content):
    """Return (title, text) for a known layout, or None for an unknown one."""
    if layout == "pattern1":
        title = file_content.split('[title]')[-1].split('\n')[0]
        text = file_content.split('[Text]\n')[-1]
        return title, text

    if layout == "pattern2":
        title_match = re.search(r'title:(.*)\n', file_content)
        station_match = re.search(r'Sending station:(.*)\n', file_content)
        title = title_match.group(1) if title_match else _NO_MATCH
        # The body is whatever follows the sending-station value; an empty
        # value cannot be used as a split separator.
        if station_match and station_match.group(1):
            text = file_content.split(station_match.group(1))[-1]
        else:
            text = _NO_MATCH
        return title, text

    if layout == "pattern3":
        # Title precedes the agency dateline line, body follows it.
        return _split_on(_DATELINE_RE.search(file_content), file_content)

    if layout == "pattern4":
        # Title precedes the view counter, body follows it.
        return _split_on(_VIEWS_RE.search(file_content), file_content)

    return None


def dataProcessing(dataset_path):
    """Walk *dataset_path* and turn every .txt/.TXT file into a document.

    Files are read as GBK with undecodable bytes ignored. The layout of each
    file is detected with ``detect_text_formatting`` and the title/body are
    extracted accordingly; files with an unrecognised layout keep their full
    content as the text and get "<category>-<file name>" as the title.

    Args:
        dataset_path: Root directory of the corpus; its first-level
            sub-directories name the categories.

    Returns:
        A list of dicts with the keys "cata", "title" and "text", in the
        order the files are visited by ``os.walk``.
    """
    documents = []
    for root, _dirs, files in os.walk(dataset_path):
        category = _category_of(root, dataset_path)
        for file in files:
            if not file.endswith(('.txt', '.TXT')):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='gbk', errors="ignore") as f:
                file_content = f.read()

            layout = detect_text_formatting(_LAYOUT_PATTERNS, file_content)
            fields = _extract_fields(layout, file_content)
            if fields is None:
                # Unknown layout: keep the raw content and build a title
                # from the category and the file name.
                fields = (f"{category}-{file}", file_content)

            title, text = fields
            documents.append({"cata": category, "title": title, "text": text})
    return documents
- detect_text_formatting.py
This part detects which layout format a news file uses.
import re


def detect_text_formatting(patterns, str):
    """Return the name of the first pattern that occurs in the text.

    Args:
        patterns: Mapping of regular-expression source to layout name. The
            patterns are tried in the mapping's iteration order, so earlier
            entries win when several match.
        str: Text to inspect. (The name shadows the builtin but is kept so
            existing keyword callers keep working.)

    Returns:
        The layout name of the first pattern found anywhere in the text
        (``re.search``), or the string "NO" when none matches.
    """
    for pattern, name in patterns.items():
        if re.search(pattern, str):
            return name
    return "NO"
- solr_client.py
This part defines a client class that wraps the Solr operations used by the project.
from pysolr import Solr


class SolrClient:
    """Thin convenience wrapper around a ``pysolr.Solr`` connection."""

    def __init__(self, solr_url='http://127.0.0.1:8983/solr/newsCore', always_commit=True):
        """Open a connection to the Solr core at *solr_url*.

        Args:
            solr_url: URL of the Solr core.
            always_commit: Forwarded to ``pysolr.Solr`` unchanged.
        """
        self.solr = Solr(solr_url, always_commit=always_commit)

    def add_document(self, documents):
        """Send *documents* (an iterable of dicts) to Solr for indexing."""
        self.solr.add(documents)

    def delete_document(self, document_id):
        """Delete the document whose id is *document_id*."""
        self.solr.delete(id=document_id)

    def update_document(self, document):
        """Re-submit a single *document* through the same add call."""
        self.solr.add([document])

    def search_documents(self, query, rows=10):
        """Run *query* and return the matching documents.

        Args:
            query: Query string passed to Solr as-is.
            rows: Maximum number of documents to request (default 10, the
                value that used to be hard-coded).

        Returns:
            The ``docs`` list of the pysolr result object.
        """
        results = self.solr.search(query, rows=rows)
        return results.docs

    def clear_all_documents(self):
        """Delete every document in the core (query ``*:*``)."""
        self.solr.delete(q='*:*')
- retrievalPage.py
This part implements the interactive retrieval loop.
from solr_client import SolrClient
from dataProcessing import dataProcessing

dataset_path = "./Text classification corpus"

# Set to True for the run that should load the corpus into Solr. Parsing the
# whole corpus is only needed for indexing, so it is skipped otherwise
# instead of being repeated on every start.
LOAD_CORPUS = False


def _first_title(doc):
    """Return a printable title for a result document.

    Handles both a multi-valued field (list of strings, first entry is
    used) and a single-valued one; a missing title yields "".
    """
    title = doc.get("title")
    if isinstance(title, (list, tuple)):
        return title[0] if title else ""
    return "" if title is None else title


def main():
    """Run the interactive title search until input ends or Ctrl-C."""
    solr_client = SolrClient()
    if LOAD_CORPUS:
        solr_client.add_document(dataProcessing(dataset_path))

    print("=" * 10, "News Query System", "=" * 10)
    while True:
        print("Dear Mr. Abalone, please enter the query:")
        try:
            q = input().strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop without a traceback.
            print()
            break
        if not q:
            # A bare "title:" is not a valid query; ask again.
            continue
        for result in solr_client.search_documents("title:" + q):
            print(_first_title(result))


if __name__ == "__main__":
    main()