[news retrieval system: news retrieval based on pysolr]

This article first preprocesses a news text corpus covering ten categories, then loads the preprocessed documents into Solr through the pysolr interface and uses Solr to implement general retrieval functions. (The Solr service must be configured and running beforehand.)

  1. dataProcessing.py
    This part applies a layout-specific regular expression to each news file in order to extract its title and body text.
import os
import re
from detect_text_formatting import detect_text_formatting
dataset_path="./Text classification corpus"


def dataProcessing(dataset_path):
    """Walk *dataset_path* and convert every news text file into a record.

    Each ``.txt``/``.TXT`` file is read as GBK (undecodable bytes are
    dropped), its layout is identified with ``detect_text_formatting`` and
    the title and body are extracted with the rule for that layout.

    Args:
        dataset_path: Root directory of the corpus. Files are expected to
            sit below first-level sub-directories named after their category.

    Returns:
        A list of dicts with the keys ``"cata"`` (category), ``"title"`` and
        ``"text"``, one per text file. Directories and files are visited in
        sorted order so the result is reproducible. A field that cannot be
        extracted is set to ``"No match found"``. A missing or empty
        directory yields an empty list.
    """
    # Leading run of CJK characters in a category directory name, so that a
    # directory called "<category><count>" yields just the category.
    category_pattern = re.compile(r'^[\u4e00-\u9fa5]+')
    # Layout detectors, tried in insertion order by detect_text_formatting.
    # NOTE(review): the marker texts must match the corpus files verbatim --
    # confirm them against the actual data.
    patterns = {
        r"【Date】": "pattern1",
        r"Sun and Moon Guanghua --": "pattern2",
        r".*?社.*?\d+日电": "pattern3",
        r"Number of views: (\d+)": "pattern4",
    }
    records = []
    for root, dirs, files in os.walk(dataset_path):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        category = _category_for(root, dataset_path, category_pattern)
        for file in sorted(files):
            if not file.endswith(('.txt', '.TXT')):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='gbk', errors="ignore") as f:
                file_content = f.read()
            layout = detect_text_formatting(patterns, file_content)
            parsed = _split_title_and_text(layout, file_content)
            if parsed is None:
                # Unknown layout: index the whole file under a synthetic
                # "<category>-<file name>" title.
                parsed = (category + "-" + file, file_content)
            title, text = parsed
            records.append({"cata": category, "title": title, "text": text})
    return records


def _category_for(root, dataset_path, category_pattern):
    """Return the category of the files in *root*.

    The category is taken from the first-level directory below
    *dataset_path*: its leading CJK characters when it starts with any,
    otherwise the whole directory name. Files lying directly in
    *dataset_path* get an empty category.
    """
    relative = os.path.relpath(root, dataset_path)
    top_level = relative.split(os.sep)[0]
    if top_level == os.curdir:
        return ""
    match = category_pattern.match(top_level)
    return match.group() if match else top_level


def _split_title_and_text(layout, file_content):
    """Return ``(title, text)`` for a known layout, or None for an unknown one."""
    no_match = "No match found"
    if layout == "pattern1":
        # Tagged layout: the title follows the last title tag up to the end
        # of that line, the body follows the last text tag.
        # NOTE(review): tag texts must match the corpus verbatim -- confirm.
        text = file_content.rpartition('[Text]\n')[2]
        title = file_content.rpartition('[title]')[2].partition('\n')[0]
        return title, text
    if layout == "pattern2":
        title_match = re.search(r'title:(.*)\n', file_content)
        station_match = re.search(r'Sending station:(.*)\n', file_content)
        title = title_match.group(1) if title_match else no_match
        # The body is everything after the "Sending station" line. Slicing
        # at the match end avoids splitting on the captured text, which
        # fails when it is empty and truncates the body when it recurs.
        text = file_content[station_match.end():] if station_match else no_match
        return title, text
    if layout == "pattern3":
        # NOTE(review): the nested group "((.*))" may have been a pair of
        # full-width parentheses around one group originally -- confirm.
        dateline = re.search(r'\n(.*)电((.*))', file_content)
        if dateline is None:
            # E.g. the dateline sits on the very first line of the file.
            return no_match, no_match
        return file_content[:dateline.start()].strip(), file_content[dateline.end():]
    if layout == "pattern4":
        views = re.search(r'Number of views: (\d+)', file_content)
        if views is None:
            return no_match, no_match
        return file_content[:views.start()].strip(), file_content[views.end():]
    return None
  2. detect_text_formatting.py
    This part detects which layout format a news file uses.
def detect_text_formatting(patterns, str):
    """Return the name of the first pattern that occurs in the text.

    Args:
        patterns: Mapping of regular expression -> layout name; entries are
            tried in the mapping's iteration order.
        str: The text to inspect (the name shadows the builtin but is kept
            because it is part of the function's signature).

    Returns:
        The name mapped to the first regular expression found anywhere in
        the text, or ``"NO"`` when none of them matches.
    """
    # Lazy generator: evaluation stops at the first expression that matches.
    matching_names = (
        label for regex, label in patterns.items() if re.search(regex, str)
    )
    return next(matching_names, "NO")
  3. solr_client.py
    This part defines the Solr client wrapper class.
from pysolr import Solr


class SolrClient:
    """Small convenience wrapper around a ``pysolr.Solr`` connection."""

    def __init__(self, solr_url='http://127.0.0.1:8983/solr/newsCore', always_commit=True):
        """Create the connection.

        Args:
            solr_url: URL of the Solr core to talk to.
            always_commit: Forwarded unchanged to ``Solr``.
        """
        # Connection handle shared by every method below.
        self.solr = Solr(solr_url, always_commit=always_commit)

    def add_document(self, documents):
        """Pass *documents* to ``Solr.add``."""
        self.solr.add(documents)

    def delete_document(self, document_id):
        """Delete the document whose id is *document_id*."""
        self.solr.delete(id=document_id)

    def update_document(self, document):
        """Submit a single *document* through ``Solr.add``."""
        single_item_batch = [document]
        self.solr.add(single_item_batch)

    def search_documents(self, query):
        """Run *query* and return the ``docs`` of the result (at most 10 rows requested)."""
        return self.solr.search(query, rows=10).docs

    def clear_all_documents(self):
        """Delete everything matching the ``*:*`` query."""
        self.solr.delete(q='*:*')
  4. retrievalPage.py
    This part implements the interactive retrieval loop.
from solr_client import SolrClient

from dataProcessing import dataProcessing
# Interactive title search against the Solr core.
dataset_path = "./Text classification corpus"
# Parsed corpus records; only needed by the (disabled) indexing step below.
c = dataProcessing(dataset_path)
solr_client = SolrClient()

# Add document
# solr_client.add_document(c)


print("=" * 10, "News Query System", "=" * 10)
while True:
    print("Dear Mr. Abalone, please enter the query:")
    try:
        q = input().strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C ends the session instead of a traceback.
        break
    if not q:
        # An empty query would send the malformed "title:" to Solr.
        continue
    query = "title:" + q
    results = solr_client.search_documents(query)
    for result in results:
        # Skip hits that carry no title instead of raising KeyError.
        titles = result.get("title")
        if titles:
            print(titles[0])