DFA Algorithm (Deterministic Finite Automaton): Sensitive Word Filtering, PHP Example

This article shows a PHP example of sensitive word filtering based on the DFA (deterministic finite automaton) algorithm.

1. Implement a HashMap class in PHP on top of the native array

<?php
/**
 * HashMap class built on top of a native PHP array
 **/

namespace DFAMaster;

class HashMap
{
    /**
     * Backing store: a native PHP array holding the key-value pairs.
     * It is always an array (never null) so that count() and
     * array_key_exists() stay valid after removeAll().
     * @var array
     */
    protected $hashTable = array();

    public function __construct(){}

    /**
     * Add a key-value pair to the HashMap, overwriting any existing entry.
     * @param $key
     * @param $value
     * @return mixed|null the value previously stored under $key, or null if there was none
     */
    public function put($key, $value)
    {
        $previous = array_key_exists($key, $this->hashTable) ? $this->hashTable[$key] : null;
        $this->hashTable[$key] = $value;
        return $previous;
    }

    /**
     * Get the value stored under the given key.
     * @param $key
     * @return mixed|null the stored value, or null when the key is absent
     */
    public function get($key)
    {
        if (array_key_exists($key, $this->hashTable)) {
            return $this->hashTable[$key];
        }
        return null;
    }

    /**
     * Delete the key-value pair of the specified key.
     * @param $key
     * @return mixed|null the removed value, or null when the key is absent
     */
    public function remove($key)
    {
        if (! array_key_exists($key, $this->hashTable)) {
            return null;
        }
        $removed = $this->hashTable[$key];
        // unset() removes exactly this key and keeps every other entry,
        // including entries whose value is falsy (false, 0, '', null).
        unset($this->hashTable[$key]);
        return $removed;
    }

    /**
     * Get all keys of the HashMap.
     * @return array
     */
    public function keys()
    {
        return array_keys($this->hashTable);
    }

    /**
     * Get all values of the HashMap.
     * @return array
     */
    public function values()
    {
        return array_values($this->hashTable);
    }

    /**
     * Copy every entry of another HashMap into this one.
     * Existing keys are overwritten; an empty map is a no-op.
     * @param HashMap $map
     */
    public function putAll($map)
    {
        foreach ($map->keys() as $key) {
            $this->put($key, $map->get($key));
        }

        return;
    }

    /**
     * Remove all elements in the HashMap.
     * @return bool always true
     */
    public function removeAll()
    {
        $this->hashTable = array();
        return true;
    }

    /**
     * Determine whether the specified value is contained in the HashMap.
     * Values are compared loosely (==).
     * @param $value
     * @return bool
     */
    public function containsValue($value)
    {
        // in_array() inspects every entry and does not depend on the
        // array's internal pointer, so falsy values do not end the search.
        return in_array($value, $this->hashTable);
    }

    /**
     * Determine whether the specified key is contained in the HashMap.
     * @param $key
     * @return bool
     */
    public function containsKey($key)
    {
        return array_key_exists($key, $this->hashTable);
    }

    /**
     * Get the number of elements in the HashMap.
     * @return int
     */
    public function size()
    {
        return count($this->hashTable);
    }

    /**
     * Determine whether the HashMap is empty.
     * @return bool
     */
    public function isEmpty()
    {
        return (count($this->hashTable) == 0);
    }
}

2. Encapsulate the sensitive-word filtering logic

<?php
/**
 * DFA algorithm: sensitive and illegal words are stored character by character in a tree, forming a finite lexicon of illegal words;
 * the text under inspection is scanned from each starting index to find matches, which makes it easy to replace them or count the illegal words;
 * sensitive words should not contain one another.
 */

namespace DFAMaster;

use Exception;

class SensitiveWords
{
    /**
     * Length (in UTF-8 characters) of the text most recently scanned
     * @var int
     */
    protected $contentLength = 0;

    /**
     * Singleton instance
     * @var object|null
     */
    private static $_instance = null;

    /**
     * Root of the sensitive-word tree: every node is a HashMap that maps one
     * character to its child node, plus an 'ending' flag (bool) telling
     * whether a word ends at that node
     * @var HashMap|null
     */
    protected $wordTree = null;

    /**
     * Optional pre-computed list of detected words; when non-empty it is used
     * by replace() and mark() instead of scanning the content
     * @var array|null
     */
    protected static $badWordList = null;

    /**
     * Get the singleton instance, creating it on first use.
     * @return self
     */
    public static function init()
    {
        if (!self::$_instance instanceof self) {
            self::$_instance = new self();
        }
        return self::$_instance;
    }

    /**
     * Character length of a string, raising an exception instead of returning false.
     * @param $str
     * @param null $encoding encoding name; when null the mbstring default is used
     * @return int
     * @throws Exception
     */
    public function mb_strlen($str, $encoding = null)
    {
        // Do not forward a null encoding: older PHP versions treat it as an
        // (invalid) empty encoding name.
        $length = null === $encoding ? \mb_strlen($str) : \mb_strlen($str, $encoding);
        if ($length === false) {
            throw new Exception('encoding is invalid');
        }

        return $length;
    }

    /**
     * Build the sensitive-word tree from a file containing one word per line.
     * Words are added to the existing tree if one was already built.
     * @param string $filepath
     * @return $this
     * @throws Exception
     */
    public function setTreeByFile($filepath = '')
    {
        if (!file_exists($filepath)) {
            throw new Exception('The sensitive lexicon file does not exist', 10003);
        }

        // Keep an already-built tree, otherwise start a new one
        $this->wordTree = $this->wordTree ?: new HashMap();

        foreach ($this->yieldToReadFile($filepath) as $word) {
            $this->buildWordToTree(trim($word));
        }

        return $this;
    }


    /**
     * Build the sensitive-word tree from an array of words.
     * Any previously built tree is discarded.
     * @param null $sensitiveWords
     * @return $this
     * @throws Exception
     */
    public function setTree($sensitiveWords = null)
    {
        if (empty($sensitiveWords)) {
            throw new Exception('Sensitive violation lexicon cannot be empty', 10002);
        }

        $this->wordTree = new HashMap();

        foreach ($sensitiveWords as $word) {
            $this->buildWordToTree($word);
        }
        return $this;
    }

    /**
     * Detect sensitive words in text.
     * @param string $content content to be detected
     * @param int $matchType match type: 1 (default) stops at the shortest word found
     *                       at each position, any other value keeps the longest one
     * @param int $wordNum maximum number of sensitive words to return [0 = all]
     * @return array the detected words, in order of appearance
     * @throws Exception
     */
    public function getBadWord($content, $matchType = 1, $wordNum = 0)
    {
        $this->assertTreeIsBuilt();

        $this->contentLength = $this->mb_strlen($content, 'utf-8');
        $badWordList = array();
        for ($start = 0; $start < $this->contentLength; $start++) {
            // Characters walked along the tree from $start
            $walked = 0;
            // Length of the last complete word seen while walking; only this
            // counts as a match, so trailing characters of a longer word that
            // never completes are not included in the result.
            $matchedLength = 0;
            $node = $this->wordTree;
            for ($i = $start; $i < $this->contentLength; $i++) {
                $keyChar = mb_substr($content, $i, 1, 'utf-8');

                // No child node for this character: the walk ends here
                $child = $node->get($keyChar);
                if (empty($child)) {
                    break;
                }

                $node = $child;
                $walked++;

                // Not the end of a word yet: keep walking
                if (false === $child->get('ending')) {
                    continue;
                }

                $matchedLength = $walked;

                // Minimum match rule: the first complete word is enough
                if (1 === $matchType) {
                    break;
                }
            }

            if ($matchedLength <= 0) {
                continue;
            }

            $badWordList[] = mb_substr($content, $start, $matchedLength, 'utf-8');

            // There is a limit to the number of returned words
            if ($wordNum > 0 && count($badWordList) == $wordNum) {
                return $badWordList;
            }

            // Resume scanning right after the matched word
            $start = $start + $matchedLength - 1;
        }
        return $badWordList;
    }

    /**
     * Replace sensitive words in the content.
     * @param $content the text content to be filtered
     * @param string $replaceChar replacement character
     * @param bool $repeat true => each word is replaced by $replaceChar repeated
     *                     once per character of the word; false => by $replaceChar once
     * @param int $matchType see getBadWord()
     * @return mixed the filtered content
     * @throws Exception
     */
    public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
    {
        if (empty($content)) {
            throw new Exception('Please fill in the detected content', 10001);
        }
        $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
        // Sensitive words are not detected, return directly
        if (empty($badWordList)) {
            return $content;
        }

        foreach ($badWordList as $badWord) {
            $replacement = $replaceChar;
            if ($repeat) {
                $replacement = $this->dfaBadWordConversChars($badWord, $replaceChar);
            }
            $content = str_replace($badWord, $replacement, $content);
        }
        return $content;
    }

    /**
     * Wrap every sensitive word of the content between two tags.
     * @param $content text content
     * @param string $sTag start tag, such as <mark>
     * @param string $eTag end tag, such as </mark>
     * @param int $matchType see getBadWord()
     * @return mixed the marked content
     * @throws Exception
     */
    public function mark($content, $sTag, $eTag, $matchType = 1)
    {
        if (empty($content)) {
            throw new Exception('Please fill in the detected content', 10001);
        }
        $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
        // Sensitive words are not detected, return directly
        if (empty($badWordList)) {
            return $content;
        }
        // Each distinct word is wrapped once; str_replace handles all its occurrences
        $badWordList = array_unique($badWordList);
        foreach ($badWordList as $badWord) {
            $content = str_replace($badWord, $sTag . $badWord . $eTag, $content);
        }
        return $content;
    }

    /**
     * Check the content for sensitive words.
     * NOTE: despite its name, this method returns true when a sensitive word
     * IS found and false when none is found; the return convention is kept
     * unchanged for existing callers.
     * @param $content
     * @return bool true if the content contains at least one sensitive word
     * @throws Exception
     */
    public function islegal($content)
    {
        $this->assertTreeIsBuilt();

        $this->contentLength = $this->mb_strlen($content, 'utf-8');

        // Every position is tried as a starting point. A partial match that
        // never completes must not make the scan skip ahead, otherwise a
        // word starting inside the partial match would be missed.
        for ($start = 0; $start < $this->contentLength; $start++) {
            $node = $this->wordTree;
            for ($i = $start; $i < $this->contentLength; $i++) {
                $keyChar = mb_substr($content, $i, 1, 'utf-8');

                // No child node for this character: try the next starting position
                $child = $node->get($keyChar);
                if (empty($child)) {
                    break;
                }

                // A complete word ends here
                if (false !== $child->get('ending')) {
                    return true;
                }
                $node = $child;
            }
        }
        return false;
    }

    /**
     * Read the sensitive word file lazily, one line at a time.
     * @param $filepath
     * @return \Generator yields each raw line (including its line ending)
     * @throws Exception when the file cannot be opened
     */
    protected function yieldToReadFile($filepath)
    {
        $fp = fopen($filepath, 'r');
        if (false === $fp) {
            throw new Exception('The sensitive lexicon file cannot be opened');
        }

        // finally guarantees the handle is closed even when the consumer
        // stops iterating early (e.g. an exception while building the tree).
        try {
            while (false !== ($line = fgets($fp))) {
                yield $line;
            }
        } finally {
            fclose($fp);
        }
    }

    /**
     * Insert a single sensitive word into the tree, one character per level.
     * @param $word
     * @return void
     * @throws Exception
     */
    protected function buildWordToTree($word = '')
    {
        if ('' === $word) {
            return;
        }
        $tree = $this->wordTree;

        $wordLength = $this->mb_strlen($word, 'utf-8');
        for ($i = 0; $i < $wordLength; $i++) {
            $keyChar = mb_substr($word, $i, 1, 'utf-8');

            // Reuse the child node when this prefix already exists
            $childTree = $tree->get($keyChar);

            if ($childTree) {
                $tree = $childTree;
            } else {
                // New node, not (yet) the end of a word
                $newTree = new HashMap();
                $newTree->put('ending', false);

                $tree->put($keyChar, $newTree);
                $tree = $newTree;
            }

            // Last character: flag this node as the end of a word
            if ($i == $wordLength - 1) {
                $tree->put('ending', true);
            }
        }

        return;
    }

    /**
     * Build the replacement for a sensitive word: $char repeated once per
     * character of the word.
     * @param $word
     * @param $char
     * @return string
     * @throws Exception
     */
    protected function dfaBadWordConversChars($word, $char)
    {
        return str_repeat($char, $this->mb_strlen($word, 'utf-8'));
    }

    /**
     * Fail with a catchable exception when no word tree has been built yet.
     * @return void
     * @throws Exception
     */
    private function assertTreeIsBuilt()
    {
        if (null === $this->wordTree) {
            throw new Exception('The sensitive word tree has not been built; call setTree() or setTreeByFile() first');
        }
    }
}

3. Usage example

<?php

use DFAMaster\SensitiveWords;

/**
 * Usage example for SensitiveWords: builds the word tree from an in-memory
 * word list and runs detection, replacement and marking on a given text.
 */
class test
{
    /**
     * Sensitive words used to build the tree
     * @var array
     */
    protected $wordData;

    /**
     * Text to be checked
     * @var string
     */
    protected $content = '';

    /**
     * Path of a sensitive word file (one word per line), for setTreeByFile()
     * @var string
     */
    protected $wordsPath = '';

    /**
     * @param $content the text to be checked
     */
    public function __construct($content)
    {
        // Content to be checked
        $this->content = $content;
        // Sensitive words; double quotes are required because the list contains an apostrophe
        $wordPool = "Accused of plagiarism, the company's responsibility, no code, graduation certificate, certificate, certificate";
        // Trim each entry so the space after each comma does not become part of the word
        $this->wordData = array_map('trim', explode(',', $wordPool));
        // Sensitive word file path
        $this->wordsPath = '/data/words.txt';
    }

    /**
     * Run every SensitiveWords operation once on the content.
     * The results are only assigned to local variables for demonstration.
     * @return void
     * @throws Exception
     */
    public function test()
    {
        // Detect all sensitive words
        $filterContent = SensitiveWords::init()
            ->setTree($this->wordData)
            //->setTreeByFile($this->wordsPath) //Load the lexicon from a file instead
            ->getBadWord($this->content);

        // Return at most a given number of sensitive words, e.g. to discard the content as soon as some are found
        $badWords = SensitiveWords::init()
            ->setTree($this->wordData)
            //->setTreeByFile($this->wordsPath) //Load the lexicon from a file instead
            ->getBadWord($this->content, 1, 2);


        // Replace each sensitive word with a single character
        $filterContent = SensitiveWords::init()
            ->setTree($this->wordData)
            //->setTreeByFile($this->wordsPath) //Load the lexicon from a file instead
            ->replace($this->content, '*');

        // Replace each sensitive word with as many characters as the word has
        $filterContent = SensitiveWords::init()
            ->setTree($this->wordData)
            //->setTreeByFile($this->wordsPath) //Load the lexicon from a file instead
            ->replace($this->content, '*', true);

        // Wrap sensitive words in tags
        $markedContent = SensitiveWords::init()
            ->setTree($this->wordData)
            //->setTreeByFile($this->wordsPath) //Load the lexicon from a file instead
            ->mark($this->content, '<mark>', '</mark>');

    }

}