DFA (Deterministic Finite Automaton) Algorithm: Sensitive-Word Filtering with a PHP Example
1. Implement a HashMap class on top of a native PHP array:
<?php

/**
 * HashMap structure built on top of a native PHP array.
 */

namespace DFAMaster;

class HashMap
{
    /**
     * Backing storage (key => value). Always an array, never null, so that
     * count() / array_key_exists() stay valid after removeAll().
     *
     * @var array
     */
    protected $hashTable = [];

    public function __construct()
    {
    }

    /**
     * Add a key-value pair to the HashMap, overwriting any existing entry.
     *
     * @param mixed $key
     * @param mixed $value
     * @return mixed|null the value previously stored under $key, or null if the key was absent
     */
    public function put($key, $value)
    {
        $previous = $this->get($key);
        $this->hashTable[$key] = $value;

        return $previous;
    }

    /**
     * Get the value stored under the given key.
     *
     * @param mixed $key
     * @return mixed|null the stored value, or null if the key is absent
     */
    public function get($key)
    {
        if (array_key_exists($key, $this->hashTable)) {
            return $this->hashTable[$key];
        }

        return null;
    }

    /**
     * Delete the entry stored under the given key.
     *
     * @param mixed $key
     * @return mixed|null the removed value, or null if the key was absent
     */
    public function remove($key)
    {
        if (!array_key_exists($key, $this->hashTable)) {
            return null;
        }

        $removed = $this->hashTable[$key];
        // unset() drops exactly this entry; walking the array with current()
        // would stop at the first falsy value and silently lose later entries.
        unset($this->hashTable[$key]);

        return $removed;
    }

    /**
     * Get all keys of the HashMap.
     *
     * @return array
     */
    public function keys()
    {
        return array_keys($this->hashTable);
    }

    /**
     * Get all values of the HashMap.
     *
     * @return array
     */
    public function values()
    {
        return array_values($this->hashTable);
    }

    /**
     * Copy every entry of another HashMap into this one (existing keys are overwritten).
     *
     * @param \DFAMaster\HashMap $map
     * @return void
     */
    public function putAll($map)
    {
        if ($map->isEmpty()) {
            return;
        }

        foreach ($map->keys() as $key) {
            $this->put($key, $map->get($key));
        }
    }

    /**
     * Remove all elements from the HashMap.
     *
     * @return bool always true
     */
    public function removeAll()
    {
        // Reset to an empty array (not null) so the map remains usable afterwards.
        $this->hashTable = [];

        return true;
    }

    /**
     * Determine whether the given value is contained in the HashMap.
     *
     * @param mixed $value
     * @return bool
     */
    public function containsValue($value)
    {
        // Loose (==) comparison is deliberate: it is the comparison this
        // method has always used, so existing callers see the same results.
        return in_array($value, $this->hashTable, false);
    }

    /**
     * Determine whether the given key is contained in the HashMap.
     *
     * @param mixed $key
     * @return bool
     */
    public function containsKey($key)
    {
        return array_key_exists($key, $this->hashTable);
    }

    /**
     * Get the number of elements in the HashMap.
     *
     * @return int
     */
    public function size()
    {
        return count($this->hashTable);
    }

    /**
     * Determine whether the HashMap is empty.
     *
     * @return bool
     */
    public function isEmpty()
    {
        return count($this->hashTable) === 0;
    }
}
2. Encapsulate the sensitive-word filtering logic:
<?php

/**
 * DFA algorithm: sensitive / illegal words are stored character by character
 * in a tree of HashMap nodes, forming a finite lexicon of illegal words.
 * The text is scanned from every character offset so that the start position
 * and length of each hit are known, which makes replacing or counting easy.
 *
 * Words may share prefixes. When one word is a prefix of another, the minimum
 * rule reports the shorter word and the maximum rule reports the longest word
 * that is completely present in the text.
 */

namespace DFAMaster;

use Exception;

class SensitiveWords
{
    /**
     * Length (in characters) of the text currently being scanned.
     *
     * @var int
     */
    protected $contentLength = 0;

    /**
     * Singleton instance.
     *
     * @var object|null
     */
    private static $_instance = null;

    /**
     * Root node of the sensitive-word tree.
     *
     * @var HashMap|null
     */
    protected $wordTree = null;

    /**
     * Optional pre-computed list of bad words; when non-empty, replace() and
     * mark() use it instead of scanning the content.
     *
     * @var array|null
     */
    protected static $badWordList = null;

    /**
     * Get the singleton instance.
     *
     * @return self
     */
    public static function init()
    {
        if (!self::$_instance instanceof self) {
            self::$_instance = new self();
        }

        return self::$_instance;
    }

    /**
     * Character length of a string, turning an invalid encoding into an exception.
     *
     * @param string      $str
     * @param string|null $encoding null means "use the mbstring internal encoding"
     * @return int
     * @throws Exception when mb_strlen() reports failure
     */
    public function mb_strlen($str, $encoding = null)
    {
        // Only forward the encoding when one was given; a literal null is not
        // accepted as an encoding name by older PHP versions.
        $length = null === $encoding ? \mb_strlen($str) : \mb_strlen($str, $encoding);
        if ($length === false) {
            throw new Exception('encoding is invalid');
        }

        return $length;
    }

    /**
     * Build the sensitive-word tree from a file (one word per line).
     * Words are added to the existing tree if there is one.
     *
     * @param string $filepath
     * @return $this
     * @throws Exception when the file does not exist or cannot be read
     */
    public function setTreeByFile($filepath = '')
    {
        if (!file_exists($filepath)) {
            throw new Exception('The sensitive lexicon file does not exist', 10003);
        }

        // Lexicon tree initialization (keep an already built tree)
        $this->wordTree = $this->wordTree ?: new HashMap();

        foreach ($this->yieldToReadFile($filepath) as $word) {
            $this->buildWordToTree(trim($word));
        }

        return $this;
    }

    /**
     * Build the sensitive-word tree from an array of words.
     * Any previously built tree is discarded.
     *
     * @param array|null $sensitiveWords
     * @return $this
     * @throws Exception when the word list is empty
     */
    public function setTree($sensitiveWords = null)
    {
        if (empty($sensitiveWords)) {
            throw new Exception('Sensitive violation lexicon cannot be empty', 10002);
        }

        $this->wordTree = new HashMap();

        foreach ($sensitiveWords as $word) {
            $this->buildWordToTree($word);
        }

        return $this;
    }

    /**
     * Detect sensitive words in a text. A tree must have been built first
     * with setTree() or setTreeByFile().
     *
     * @param string $content   content to be detected
     * @param int    $matchType match type: 1 = minimum match rule (default), anything else = maximum match
     * @param int    $wordNum   number of sensitive words to return (0 = all, default)
     * @return array the sensitive words found, in order of appearance (duplicates included)
     * @throws Exception
     */
    public function getBadWord($content, $matchType = 1, $wordNum = 0)
    {
        $this->contentLength = $this->mb_strlen($content, 'utf-8');
        $badWordList = [];

        for ($start = 0; $start < $this->contentLength; $start++) {
            $matchedLength = $this->matchLengthAt($content, $start, $matchType);
            if ($matchedLength <= 0) {
                continue;
            }

            $badWordList[] = mb_substr($content, $start, $matchedLength, 'utf-8');

            // There is a limit on the number of returned words
            if ($wordNum > 0 && count($badWordList) == $wordNum) {
                return $badWordList;
            }

            // Continue scanning right after the matched word
            $start += $matchedLength - 1;
        }

        return $badWordList;
    }

    /**
     * Replace sensitive words in a text.
     *
     * @param string $content     the text content to be filtered
     * @param string $replaceChar replacement character
     * @param bool   $repeat      true => each word is replaced by $replaceChar repeated to the word's length
     * @param int    $matchType
     * @return mixed the filtered content
     * @throws Exception when $content is empty
     */
    public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
    {
        if (empty($content)) {
            throw new Exception('Please fill in the detected content', 10001);
        }

        $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);

        // No sensitive words detected, return the content untouched
        if (empty($badWordList)) {
            return $content;
        }

        foreach ($badWordList as $badWord) {
            $replacement = $repeat
                ? $this->dfaBadWordConversChars($badWord, $replaceChar)
                : $replaceChar;
            $content = str_replace($badWord, $replacement, $content);
        }

        return $content;
    }

    /**
     * Mark sensitive words by wrapping them in a pair of tags.
     *
     * @param string $content   text content
     * @param string $sTag      start tag, such as <mark>
     * @param string $eTag      end tag, such as </mark>
     * @param int    $matchType
     * @return mixed the marked content
     * @throws Exception when $content is empty
     */
    public function mark($content, $sTag, $eTag, $matchType = 1)
    {
        if (empty($content)) {
            throw new Exception('Please fill in the detected content', 10001);
        }

        $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);

        // No sensitive words detected, return the content untouched
        if (empty($badWordList)) {
            return $content;
        }

        // De-duplicate so that a word occurring twice is not wrapped twice
        foreach (array_unique($badWordList) as $badWord) {
            $content = str_replace($badWord, $sTag . $badWord . $eTag, $content);
        }

        return $content;
    }

    /**
     * Check a text for sensitive words.
     *
     * NOTE: despite its name, this method returns true when the content
     * CONTAINS a sensitive word and false when it is clean. That behaviour is
     * kept as-is because existing callers rely on it.
     *
     * @param string $content
     * @return bool true if at least one sensitive word was found
     * @throws Exception
     */
    public function islegal($content)
    {
        $this->contentLength = $this->mb_strlen($content, 'utf-8');

        // Every offset is tried: skipping ahead after a partial prefix match
        // could miss a word that starts inside that prefix.
        for ($start = 0; $start < $this->contentLength; $start++) {
            if ($this->matchLengthAt($content, $start, 1) > 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Read the sensitive-word file line by line.
     *
     * @param string $filepath
     * @return \Generator yields each raw line (including its line ending)
     * @throws Exception when the file cannot be opened
     */
    protected function yieldToReadFile($filepath)
    {
        $fp = fopen($filepath, 'r');
        if (false === $fp) {
            throw new Exception('The sensitive lexicon file cannot be read', 10003);
        }

        try {
            while (false !== ($line = fgets($fp))) {
                yield $line;
            }
        } finally {
            // Runs even when the consumer abandons the generator early
            fclose($fp);
        }
    }

    /**
     * Insert a single sensitive word into the tree.
     *
     * @param string $word
     * @return void
     * @throws Exception
     */
    protected function buildWordToTree($word = '')
    {
        if ('' === $word) {
            return;
        }

        $wordLength = $this->mb_strlen($word, 'utf-8');
        if ($wordLength <= 0) {
            return;
        }

        $node = $this->wordTree;
        for ($i = 0; $i < $wordLength; $i++) {
            $keyChar = mb_substr($word, $i, 1, 'utf-8');

            // Descend into the child node, creating it when missing
            $child = $node->get($keyChar);
            if (!$child) {
                $child = new HashMap();
                // A fresh node is not the end of a word until proven otherwise
                $child->put('ending', false);
                $node->put($keyChar, $child);
            }
            $node = $child;
        }

        // The node reached by the last character terminates a word
        $node->put('ending', true);
    }

    /**
     * Build the replacement for a sensitive word: $char repeated once per character of $word.
     *
     * @param string $word
     * @param string $char
     * @return string
     * @throws Exception
     */
    protected function dfaBadWordConversChars($word, $char)
    {
        return str_repeat($char, $this->mb_strlen($word, 'utf-8'));
    }

    /**
     * Length of the sensitive word starting at $start, or 0 when none starts there.
     * Relies on $this->contentLength having been set for $content.
     *
     * @param string $content
     * @param int    $start
     * @param int    $matchType 1 = stop at the first complete word, otherwise keep the longest one
     * @return int
     */
    private function matchLengthAt($content, $start, $matchType)
    {
        $node = $this->wordTree;
        $matchedLength = 0;

        for ($i = $start; $i < $this->contentLength; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));

            // No child for this character: the walk ends here
            if (empty($node)) {
                break;
            }

            // Not the end of a word yet, keep walking
            if (false === $node->get('ending')) {
                continue;
            }

            // Remember the last complete word rather than the walked depth,
            // so trailing characters of an unfinished longer word are not included.
            $matchedLength = $i - $start + 1;

            // Minimum rule: the first complete word wins
            if (1 === $matchType) {
                break;
            }
        }

        return $matchedLength;
    }
}
3. Example usage:
<?php

use DFAMaster\SensitiveWords;

/**
 * Usage example for the SensitiveWords filter.
 */
class test
{
    /**
     * Sensitive words used to build the tree.
     *
     * @var array
     */
    protected $wordData;

    /**
     * Content to be checked.
     *
     * @var string
     */
    protected $content = '';

    /**
     * Path of a sensitive-word file (one word per line), for setTreeByFile().
     *
     * @var string
     */
    protected $wordsPath = '';

    /**
     * @param string $content the content to be checked
     */
    public function __construct($content)
    {
        // Content to be checked
        $this->content = $content;

        // Sensitive words. Double quotes are required here because the text
        // itself contains an apostrophe.
        $wordPool = "Accused of plagiarism, the company's responsibility, no code, graduation certificate, certificate, certificate";

        // Trim each entry: the list is separated by ", " and a leading space
        // would become part of the word and prevent it from ever matching.
        $this->wordData = array_map('trim', explode(',', $wordPool));

        // Sensitive-word file path
        $this->wordsPath = '/data/words.txt';
    }

    /**
     * Demonstrates each filter operation. In every call below,
     * ->setTree($this->wordData) can be swapped for
     * ->setTreeByFile($this->wordsPath) to load the lexicon from a file.
     *
     * @return void
     * @throws Exception
     */
    public function test()
    {
        // Find all sensitive words
        $filterContent = SensitiveWords::init()
            ->setTree($this->wordData)
            ->getBadWord($this->content);

        // Return at most a given number of sensitive words; useful to decide
        // quickly whether the content should be discarded
        $badWords = SensitiveWords::init()
            ->setTree($this->wordData)
            ->getBadWord($this->content, 1, 2);

        // Replace each sensitive word with a single '*'
        $filterContent = SensitiveWords::init()
            ->setTree($this->wordData)
            ->replace($this->content, '*');

        // Replace each sensitive word with '*' repeated to the word's length
        $filterContent = SensitiveWords::init()
            ->setTree($this->wordData)
            ->replace($this->content, '*', true);

        // Mark sensitive words by wrapping them in tags
        $markedContent = SensitiveWords::init()
            ->setTree($this->wordData)
            ->mark($this->content, '<mark>', '</mark>');
    }
}