iapt-platform
/
mint
зеркало из https://github.com/iapt-platform/mint.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
							<?php
namespace App\Tools;

use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Log;
use App\Models\UserDict;
use App\Models\WordIndex;


class CaseMan
{
    /**
     * Create a new class instance. The class keeps no state of its own.
     */
    public function __construct()
    {
    }

    /**
     * Inflect a stem: list the word forms the ending table derives from $base.
     *
     * Each row of CaseEnding::$ending is read as
     * [0] stem suffix, [1] inflected suffix, [2] type, [3] grammar, [4] confidence.
     * Each row of CaseEnding::$union (sandhi rules) is read as
     * [0] suffix to replace, [1] replacement, [2] the joined element.
     *
     * A form is reported when it exists in WordIndex, or when at least one of its
     * sandhi variants does (in that case with count and bold set to 0).
     *
     * @param string      $base       stem to inflect
     * @param string|null $type       stem type: '.n:base.', '.ti:base.', '.adj:base.' or '.v:base.';
     *                                any other value yields an empty result
     * @param string      $grammar    for '.n:base.' only: prefix the ending's grammar must start with;
     *                                an empty string accepts every noun ending
     * @param float       $confidence endings with a lower confidence are ignored
     *
     * @return array<int, array<string, mixed>> rows with the keys word, ending, grammar, factors,
     *                                          count and bold; sandhi rows additionally carry
     *                                          'type' => '.un.'
     */
    public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
    {
        $newWord = [];
        $case = new CaseEnding();

        foreach ($case->ending as $ending) {
            if ($ending[4] < $confidence) {
                continue;
            }
            if (!$this->endingFitsBaseType($ending, $type, $grammar)) {
                continue;
            }

            $word = $this->swapSuffix($base, $ending[0], $ending[1]);
            if ($word === null) {
                continue;
            }

            // Try the sandhi rules on the inflected form.
            // TODO: chain two sandhi rules.
            $hasSandhi = false;
            foreach ($case->union as $sandhi) {
                $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
                if ($sandhiWord === null) {
                    continue;
                }
                $indexed = $this->findIndexEntry($sandhiWord);
                if (!$indexed) {
                    continue;
                }

                $hasSandhi = true;
                $newWord[] = [
                    'word' => $sandhiWord,
                    'ending' => $ending[1],
                    'type' => '.un.',
                    'grammar' => '',
                    'factors' => "{$word}+{$sandhi[2]}",
                    'count' => $indexed->count,
                    'bold' => $indexed->bold,
                ];

                // For "iti" also report the form with its last two characters ("ti") removed.
                if ($sandhi[2] === 'iti') {
                    $newWord[] = [
                        'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'),
                        'ending' => $ending[1],
                        'grammar' => $ending[3],
                        'factors' => "{$base}+[{$ending[1]}]",
                        'count' => $indexed->count,
                        'bold' => $indexed->bold,
                    ];
                }
            }

            $indexed = $this->findIndexEntry($word);
            if ($indexed || $hasSandhi) {
                $newWord[] = [
                    'word' => $word,
                    'ending' => $ending[1],
                    'grammar' => $ending[3],
                    'factors' => "{$base}+[{$ending[1]}]",
                    'count' => $indexed ? $indexed->count : 0,
                    'bold' => $indexed ? $indexed->bold : 0,
                ];
            }
        }

        return $newWord;
    }

    /**
     * Apply one ending row to $base and look up the resulting form and its sandhi variants.
     *
     * Words that are already keys of $array are skipped: they are neither looked up
     * again nor included in the result, so merging the result into $array never
     * overwrites an earlier hit.
     *
     * @param string     $base   stem to inflect
     * @param array      $ending one row of CaseEnding::$ending
     * @param array|null $array  words that were already looked up, keyed by word
     *
     * @return array<string, array{count: mixed, bold: mixed}|false> looked-up words; false marks
     *                                                               a word missing from WordIndex
     */
    private function endingMatch($base, $ending, $array = null)
    {
        $output = [];
        $word = $this->swapSuffix($base, $ending[0], $ending[1]);
        if ($word === null) {
            return $output;
        }

        $known = is_array($array) ? $array : [];
        $case = new CaseEnding();
        $candidates = array_merge([$word], $this->sandhiForms($word, $case->union));

        foreach ($candidates as $candidate) {
            if (isset($known[$candidate]) || isset($output[$candidate])) {
                continue;
            }
            $indexed = $this->findIndexEntry($candidate);
            $output[$candidate] = $indexed
                ? ['count' => $indexed->count, 'bold' => $indexed->bold]
                : false;
        }

        return $output;
    }

    /**
     * Inflect a stem with every ending, whatever its type, and keep the forms found in WordIndex.
     *
     * Each distinct candidate (inflected form or sandhi variant) is looked up once.
     *
     * @param string $base       stem to inflect
     * @param float  $confidence endings with a lower confidence are ignored
     *
     * @return array<int, array<string, mixed>> rows with the keys word, ending (the inflected
     *                                          suffix that first produced the word), count and bold
     */
    public function BaseToWord($base, $confidence = 0.5)
    {
        // word => lookup result; false remembers a miss so the word is not queried again.
        $seen = [];
        $case = new CaseEnding();

        foreach ($case->ending as $ending) {
            if ($ending[4] < $confidence) {
                continue;
            }
            $word = $this->swapSuffix($base, $ending[0], $ending[1]);
            if ($word === null) {
                continue;
            }

            // TODO: chain two sandhi rules.
            $candidates = array_merge([$word], $this->sandhiForms($word, $case->union));
            foreach ($candidates as $candidate) {
                if (isset($seen[$candidate])) {
                    continue;
                }
                $indexed = $this->findIndexEntry($candidate);
                $seen[$candidate] = $indexed
                    ? ['ending' => $ending[1], 'count' => $indexed->count, 'bold' => $indexed->bold]
                    : false;
            }
        }

        $result = [];
        foreach ($seen as $key => $value) {
            if ($value !== false) {
                $result[] = [
                    'word' => $key,
                    'ending' => $value['ending'],
                    'count' => $value['count'],
                    'bold' => $value['bold'],
                ];
            }
        }

        return $result;
    }

    /**
     * Reduce an inflected word to its candidate stems (the reverse of Declension);
     * the original author nicknamed it "the little tadpole looks for its mother".
     *
     * On every pass each not-yet-processed word has every matching ending stripped;
     * the stems found become the input of the next pass.
     *
     * With $verify enabled the stems are filtered through VerifyBase(). When no stem
     * survives, the shortest candidate stem is returned unverified as a best guess.
     *
     * @param string $word       inflected word
     * @param int    $deep       number of passes
     * @param bool   $verify     whether to check the stems against the dictionary
     * @param float  $confidence endings with a lower confidence are ignored
     *
     * @return array<string, array<int, array<string, mixed>>> stem => rows with the keys word,
     *                                                         type, grammar, parent, factors
     *                                                         and confidence
     */
    public function WordToBase($word, $deep = 1, $verify = true, $confidence = 0.5)
    {
        $newBase = [];
        // word => true while the word still has to be processed.
        $input = [$word => true];
        $case = new CaseEnding();

        for ($i = 0; $i < $deep; $i++) {
            foreach ($input as $currWord => $pending) {
                if (!$pending) {
                    continue;
                }
                $input[$currWord] = false;

                foreach ($case->ending as $ending) {
                    if ($ending[4] < $confidence) {
                        continue;
                    }
                    // Reverse direction: strip the inflected suffix, restore the stem suffix.
                    $base = $this->swapSuffix($currWord, $ending[1], $ending[0]);
                    if ($base === null) {
                        continue;
                    }
                    $newBase[$base][] = [
                        'word' => $currWord,
                        'type' => $ending[2],
                        'grammar' => $ending[3],
                        'parent' => $base,
                        'factors' => "{$base}+[{$ending[1]}]",
                        'confidence' => $ending[4],
                    ];
                }
            }

            // Queue the stems found so far for the next pass.
            foreach (array_keys($newBase) as $candidate) {
                if (!isset($input[$candidate])) {
                    $input[$candidate] = true;
                }
            }
        }

        if (!$verify) {
            return $newBase;
        }

        $output = [];
        foreach ($newBase as $base => $rows) {
            $verified = $this->VerifyBase($base, $rows);
            if ($verified !== false && count($verified) > 0) {
                $output[$base] = $verified;
            }
        }

        if (count($output) === 0) {
            // Verification failed everywhere: fall back to the shortest stem (first one on ties).
            $shortBase = null;
            $short = 0;
            foreach (array_keys($newBase) as $base) {
                $length = mb_strlen((string) $base, 'UTF-8');
                if ($shortBase === null || $length < $short) {
                    $short = $length;
                    $shortBase = $base;
                }
            }
            if ($shortBase !== null) {
                $output[$shortBase] = $newBase[$shortBase];
            }
        }

        return $output;
    }

    /**
     * Check whether $base exists in the dictionary and keep the rows that agree with it.
     *
     * A row is kept when the parent type derived from it ('.n.' => '.n:base.',
     * '.ti.' => '.ti:base.', '.v.' => '.v:base.', anything else => ''), followed for
     * non-verbs by the first '$'-separated part of the row's grammar, equals the
     * concatenated type and grammar of one of the dictionary entries for $base.
     *
     * @param string                           $base stem to look up in UserDict
     * @param array<int, array<string, mixed>> $rows candidate rows as built by WordToBase()
     *
     * @return array<int, array<string, mixed>>|false the plausible rows (possibly none), or
     *                                                false when $base is not in the dictionary
     */
    public function VerifyBase($base, $rows)
    {
        $dictWords = UserDict::where('word', $base)
            ->select(['type', 'grammar'])
            ->groupBy(['type', 'grammar'])
            ->get();
        if (count($dictWords) === 0) {
            return false;
        }

        // Grammar signatures the dictionary knows for this spelling.
        $known = [];
        foreach ($dictWords as $entry) {
            $known["{$entry->type}{$entry->grammar}"] = 1;
        }

        $output = [];
        foreach ($rows as $row) {
            // Build the signature a parent of the guessed type and grammar would have.
            switch ($row['type']) {
                case '.n.':
                    $parentType = '.n:base.';
                    break;
                case '.ti.':
                    $parentType = '.ti:base.';
                    break;
                case '.v.':
                    $parentType = '.v:base.';
                    break;
                default:
                    $parentType = '';
                    break;
            }
            if (!empty($row['grammar']) && $row['type'] !== '.v.') {
                $grammarParts = explode('$', $row['grammar']);
                $parentType .= $grammarParts[0];
            }

            // Keep only the rows whose signature the dictionary confirms.
            if (isset($known[$parentType])) {
                $output[] = $row;
            }
        }

        return $output;
    }

    /**
     * Tell whether an ending row may be applied to a stem of the given type.
     *
     * @param array       $ending  one row of CaseEnding::$ending
     * @param string|null $type    stem type passed to Declension()
     * @param string      $grammar required grammar prefix (used for '.n:base.' only)
     *
     * @return bool
     */
    private function endingFitsBaseType($ending, $type, $grammar)
    {
        switch ($type) {
            case '.n:base.':
                if ($ending[2] !== '.n.') {
                    return false;
                }
                // Explicit prefix test: strpos() with an empty needle warns and
                // returns false before PHP 8.0, which rejected every ending.
                $prefix = (string) $grammar;
                return $prefix === '' || strncmp($ending[3], $prefix, strlen($prefix)) === 0;
            case '.ti:base.':
            case '.adj:base.':
                return $ending[2] === '.ti.' || $ending[2] === '.n.';
            case '.v:base.':
                return $ending[2] === '.v.';
            default:
                return false;
        }
    }

    /**
     * Replace the suffix $from at the end of $text with $to.
     *
     * Lengths are counted in UTF-8 characters. An empty $from only matches an empty
     * $text, because mb_substr() with offset 0 returns the whole string.
     *
     * @param string $text
     * @param string $from suffix that must be present
     * @param string $to   replacement suffix
     *
     * @return string|null the rewritten text, or null when $text does not end with $from
     */
    private function swapSuffix($text, $from, $to)
    {
        $fromLength = mb_strlen($from, 'UTF-8');
        if (mb_substr($text, 0 - $fromLength, null, 'UTF-8') !== $from) {
            return null;
        }

        return mb_substr($text, 0, mb_strlen($text, 'UTF-8') - $fromLength, 'UTF-8') . $to;
    }

    /**
     * Apply every matching sandhi rule to $word.
     *
     * @param string $word
     * @param array  $unionRules rows of CaseEnding::$union
     *
     * @return array<int, string> the sandhi variants, in rule order
     */
    private function sandhiForms($word, $unionRules)
    {
        $forms = [];
        foreach ($unionRules as $sandhi) {
            $joined = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
            if ($joined !== null) {
                $forms[] = $joined;
            }
        }

        return $forms;
    }

    /**
     * Fetch the count and bold columns of a word from WordIndex.
     *
     * @param string $word
     *
     * @return object|null the first matching row, or null when the word is not indexed
     */
    private function findIndexEntry($word)
    {
        return WordIndex::where('word', $word)->select(['count', 'bold'])->first();
    }
}