How to use Python to implement word segmentation and NER web services

Everyone knows how powerful Python is for data processing, data analysis, and machine learning. So how can Python be used to implement web-based HTTP applications and services? There are in fact many options, such as using Gradio or Streamlit for web applications, or FastAPI for web services; for details, see the earlier articles on large-model applications. This article uses the Tornado framework to implement an HTTP web service, combining it with the natural language processing (NLP) tasks of word segmentation and entity recognition to show how to expose them as HTTP services. The details are as follows:
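For comparison, a minimal FastAPI sketch of the same endpoint might look like the following (the module name and the stubbed-out segmentation logic are assumptions for illustration only, not the service built below):

from fastapi import FastAPI

app = FastAPI()

@app.get("/cutsegment")
def cutsegment(content: str = "", search_type: int = 0):
    # Stub: the jieba-based segmentation implemented in the Tornado service below would go here
    return {"cut": [], "entities": [], "returncode": 0, "message": "ok"}

It could be started with, for example: uvicorn httpServer_fastapi:app --port 8082 (module name assumed).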
Operating environment: Python 3.10 with tornado and jieba installed, plus standard-library modules such as time and logging.
Run command: python httpServer_nlp.py
Calling method: http://localhost:8082/cutsegment?content=Study on three aspects of the research and implementation of semantic analysis and extraction methods for oil and gas exploration and development documents&search_type=1
Recognition results (word segmentation): {"cut": ["Oil and gas exploration", "Development", "Document", "Semantics", "Analysis", "Extraction", "Method", "Research", "Three", "Research"], "entities": [], "returncode": 0, "message": "ok", "runtime": 0.3878319263458252}
Recognition results (entities): {"cut": [], "entities": ["Method", "Semantics", "Analysis", "Extraction", "Implementation", "Documentation", "Aspect"], "returncode": 0, "message": "ok", "runtime": 0.4005763530731201}
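
Before walking through the server code, here is a minimal client sketch (assuming the service is already running locally on port 8082) that exercises both the GET and POST calling styles supported by the handler below:

import requests

url = "http://localhost:8082/cutsegment"
text = "Semantic analysis and extraction methods for oil and gas exploration and development documents"

# GET: parameters in the query string
resp = requests.get(url, params={"content": text, "search_type": 1})
print(resp.json())

# POST: the same parameters sent as a JSON body (the handler json-decodes the request body)
resp = requests.post(url, json={"content": text, "search_type": 2, "uuid": "000000"})
print(resp.json())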

import sys
import os
import time
import json
import re
import logging
from collections import OrderedDict

import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options

import jieba
from jieba import analyse  # jieba's keyword-extraction module (note the spelling: analyse)
# Set up logging
logger = logging.getLogger() # Get the root logger
logger.setLevel(logging.INFO) # Set the global log output level
# Create a file logging handler; make sure the log directory exists first, or FileHandler will fail
os.makedirs('./log', exist_ok=True)
fileHandler = logging.FileHandler(filename='./log/service.log', mode='a', encoding='utf-8', delay=False)
# Define log output style (formatter)
format_option = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(threadName)s - %(levelname)s: %(message)s'
fileHandler.setFormatter(logging.Formatter(format_option))
# Add the logging processor to the log object
logger.addHandler(fileHandler)
# logger.info("result:{}".format(output)) # Local logging example
  
# Set port
define("port", default=8082, help="--port", type=int)
# Get word segmentation results
def get_kg_result_0(text):
    # 1 Load a custom segmentation dictionary (note: reloaded on every call; move to startup for speed)
    jieba.load_userdict("./data/StopWord/user_dict.txt")
    # 2 Get stop words
    stwlist = get_stop_words()
    text, theDate = get_date(text)
    # 3 Segment the text and remove stop words
    out_dict = remove_special_tokens(jieba.cut(text, cut_all=False), stwlist)
    print('\n\n1. Load custom word segmentation dictionary:\n' + "/ ".join(out_dict))
    return out_dict + theDate
# Use TextRank to extract keywords as "entities"
def get_entity_0(text):
    # Extract the top keywords, returned in descending order of weight
    print('Extract the top keywords' + ' //')
    keywords2 = analyse.textrank(text)
    print(keywords2)
    return keywords2
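# Note: the full signature is analyse.textrank(sentence, topK=20, withWeight=False,
# allowPOS=('ns', 'n', 'vn', 'v')), so by default it returns up to 20 noun/verb keywords.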
# Read stop words
def get_stop_words(path=r'./data/StopWord/NLPIR_stopwords.txt'):
    with open(path, 'r', encoding='utf-8') as f:
        return set(f.read().split('\n'))
# Remove some stop words/single characters/empty characters, etc.
def remove_special_tokens(words, stwlist):
    words_list = list(words)
    # Iterate in reverse so that pop() does not shift the indices still to be visited
    for i in range(len(words_list) - 1, -1, -1):
        if words_list[i] in stwlist: # Remove stop words
            words_list.pop(i)
        elif len(words_list[i]) == 1: # Remove single characters (this also covers lone spaces)
            words_list.pop(i)
    return words_list
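# Example (hypothetical tokens): remove_special_tokens(["semantic", "of", "a"], {"of"})
# drops "of" (a stop word) and "a" (a single character), returning ["semantic"].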
  
# Extract dates as whole units; dates must be handled separately before word segmentation
def get_date(content):
    pattern = r'\d{4}-\d{1,2}-\d{1,2}|\d{4}年\d{1,2}月\d{1,2}日|\d{4}/\d{1,2}/\d{1,2}'
    result = re.findall(pattern, content)
    for item in result:
        content = content.replace(item, "祥") # Replace each date with a placeholder, typically an uncommon Chinese character
    return content, result
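# Example (hypothetical input): get_date("Report 2023年5月1日 on drilling")
# returns ("Report 祥 on drilling", ["2023年5月1日"]).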
# Tornado handler skeleton: add your own logic inside and standardize the output format
class MainGetHandler(tornado.web.RequestHandler):
    def recog(self, mode="get"):
        """ Can support both get and post requests """ if mode == "get":
            sub = self.get_argument("content", None)
            search_type = self.get_argument("search_type", 0)
            search_type = int(search_type)
            uid = self.get_argument("uuid", "000000")
        else:
            """ post method receives parameters passed by data """ data = json.loads(self.request.body.decode())
            sub = data["content"] if "content" in data else None
            search_type = int(data["search_type"]) if "search_type" in data else 0
            uid = data["uuid"] if "uuid" in data else "000000"
  
        #### Configuration parameters ####
        result = OrderedDict()
        returncode = 0
        message = "ok"
        output = {}
        entity = {}
        start = time.time()

        if search_type == 0 or search_type > 4:
            returncode = 10000
            message = "search_type is invalid"
        if sub is None:
            returncode = 10001
            message = "data is null"
        if search_type == 1: # Word segmentation; content cannot be empty
            try:
                if sub is None or sub in ["", " "]:
                    returncode = 10002
                    message = "when search_type is 1, content cannot be null"
                else:
                    output = get_kg_result_0(sub)
                    entity = []
            except Exception as e:
                logger.error("output: {}, error: {}".format(output, e))
                returncode = 10002
                message = "service error"
        elif search_type == 2: # Entity (keyword) recognition; content cannot be empty
            try:
                if sub is None or sub in ["", " "]:
                    returncode = 10003
                    message = "when search_type is 2, content cannot be null"
                else:
                    output = []
                    entity = get_entity_0(sub)
            except Exception as e:
                logger.error("entity: {}, error: {}".format(entity, e))
                returncode = 10003
                message = "service error"
        end = time.time()
        delta = end - start
        # Output in JSON format; see the sample responses above
        result["cut"] = output
        result["entities"] = entity # Entity recognition results
        result["returncode"] = returncode
        result["message"] = message
        result["runtime"] = delta
        logger.info("result:{}".format(result)) # Local log
        self.write(json.dumps(result, ensure_ascii=False)) # Write the response
        self.finish()
    def get(self):
        """GET entry point."""
        self.recog(mode="get")
    def post(self):
        """POST entry point."""
        self.recog(mode="post")
# Main program
if __name__ == "__main__":
    # """ Server startup """
    print("Server is listening, Port:" + str(options.port) + " ...")
    sys.path.append("../") # Load the current directory into path
    tornado.options.parse_command_line()
    # The URL route must be consistent with the path configured in nginx
    application = tornado.web.Application([(r"/cutsegment", MainGetHandler)])
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(options.port)
    tornado.ioloop.IOLoop.current().start() # IOLoop.instance() is a deprecated alias of current()
