iapt-platform
/
mint
peilaus alkaen https://github.com/iapt-platform/mint.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
							<?php

namespace App\Tools;

use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Log;
use App\Models\UserDict;
use App\Models\WordIndex;


class CaseMan
{
    /**
     * Build a CaseMan instance.
     *
     * The object holds no state, so there is nothing to initialise.
     */
    public function __construct()
    {
    }

    /**
     * Inflect a stem into the word forms that exist in the word index.
     *
     * Each rule of CaseEnding's `ending` table is read positionally:
     * [0] suffix expected on the stem, [1] suffix that replaces it,
     * [2] word type, [3] grammar string, [4] confidence.
     * For every rule that passes the filters and whose [0] matches the tail of
     * $base, the inflected form is built and looked up in WordIndex. The rules
     * of the `union` table ([0] suffix to match, [1] replacement, [2] joined
     * element) are then tried on that form to find sandhi variants.
     *
     * @param string      $base       stem to inflect
     * @param string|null $type       stem type; only '.n:base.', '.ti:base.',
     *                                '.adj:base.' and '.v:base.' select rules,
     *                                any other value yields an empty result
     * @param string      $grammar    for '.n:base.' only: prefix the rule's
     *                                grammar must start with; '' accepts every
     *                                noun rule
     * @param float       $confidence rules with a lower confidence are skipped
     * @return array list of rows with the keys word, ending, grammar, factors,
     *               count and bold (rows for sandhi forms additionally carry
     *               'type' => '.un.')
     */
    public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
    {
        $newWord = array();
        $case = new CaseEnding();
        // strpos() rejects an empty needle before PHP 8 (warning + false), which
        // made the default '' grammar discard every noun rule. The "no grammar
        // filter" case is therefore tested explicitly instead of via strpos().
        $grammar = (string) $grammar;
        $baseLen = mb_strlen($base, "UTF-8");

        foreach ($case->ending as $ending) {
            if ($ending[4] < $confidence) {
                continue;
            }

            // Keep only the rules whose type is compatible with the stem type.
            switch ($type) {
                case '.n:base.':
                    if ($ending[2] !== '.n.' || ($grammar !== '' && strpos($ending[3], $grammar) !== 0)) {
                        continue 2;
                    }
                    break;
                case '.ti:base.':
                case '.adj:base.':
                    // Both stem types accept '.ti.' as well as '.n.' rules.
                    if ($ending[2] !== '.ti.' && $ending[2] !== '.n.') {
                        continue 2;
                    }
                    break;
                case '.v:base.':
                    if ($ending[2] !== '.v.') {
                        continue 2;
                    }
                    break;
                default:
                    // Unknown (or missing) stem type: no rule applies.
                    continue 2;
            }

            $endingLen = mb_strlen($ending[0], "UTF-8");
            if (mb_substr($base, 0 - $endingLen, null, "UTF-8") !== $ending[0]) {
                continue;
            }

            // Match succeeded: swap the stem suffix for the inflected suffix.
            $word = mb_substr($base, 0, $baseLen - $endingLen, "UTF-8") . $ending[1];
            $wordLen = mb_strlen($word, "UTF-8");

            // Try sandhi.
            // TODO: add two sandhi
            $hasSandhi = false;
            foreach ($case->union as $sandhi) {
                $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
                if (mb_substr($word, 0 - $sandhiLen, null, "UTF-8") !== $sandhi[0]) {
                    continue;
                }
                $sandhiWord = mb_substr($word, 0, $wordLen - $sandhiLen, "UTF-8") . $sandhi[1];
                $sandhiRow = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
                if (!$sandhiRow) {
                    continue;
                }
                $hasSandhi = true;
                $newWord[] = [
                    'word' => $sandhiWord,
                    'ending' => $ending[1],
                    'type' => '.un.',
                    'grammar' => '',
                    'factors' => "{$word}+{$sandhi[2]}",
                    'count' => $sandhiRow->count,
                    'bold' => $sandhiRow->bold
                ];
                // Also add an entry with the trailing "ti" removed.
                if ($sandhi[2] === 'iti') {
                    $newWord[] = [
                        'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'),
                        'ending' => $ending[1],
                        'grammar' => $ending[3],
                        'factors' => "{$base}+[{$ending[1]}]",
                        'count' => $sandhiRow->count,
                        'bold' => $sandhiRow->bold
                    ];
                }
            }

            // The plain form is reported when it is indexed itself, or when at
            // least one of its sandhi variants is (then with count/bold of 0).
            $wordRow = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
            if ($wordRow || $hasSandhi) {
                $newWord[] = [
                    'word' => $word,
                    'ending' => $ending[1],
                    'grammar' => $ending[3],
                    'factors' => "{$base}+[{$ending[1]}]",
                    'count' => $wordRow ? $wordRow->count : 0,
                    'bold' => $wordRow ? $wordRow->bold : 0
                ];
            }
        }

        return $newWord;
    }

    /**
     * Apply one ending rule to a stem and look the resulting forms up.
     *
     * When $ending[0] matches the tail of $base it is replaced by $ending[1];
     * the resulting word, plus every sandhi variant produced by CaseEnding's
     * `union` table, is returned together with its WordIndex data.
     *
     * @param string     $base   stem to inflect
     * @param array      $ending one rule: [0] stem suffix, [1] inflected suffix
     * @param array|null $array  optional map word => result of earlier lookups;
     *                           words found in it are not queried again and
     *                           their stored value is returned as is
     * @return array map word => ['count' => ..., 'bold' => ...], or false for a
     *               word that is not in the index; empty when the rule does not
     *               match $base
     */
    private function endingMatch($base, $ending, $array = null)
    {
        $output = array();
        $endingLen = mb_strlen($ending[0], "UTF-8");
        if (mb_substr($base, 0 - $endingLen, null, "UTF-8") !== $ending[0]) {
            return $output;
        }

        // Resolve one candidate on its own. Doing this per word (instead of
        // sharing a $count variable across lookups) keeps a cached word from
        // inheriting the count of whichever word was queried before it.
        $lookup = function ($candidate) use ($array) {
            if (is_array($array) && isset($array[$candidate])) {
                return $array[$candidate];
            }
            $row = WordIndex::where('word', $candidate)->select(['count', 'bold'])->first();
            return $row ? ["count" => $row->count, "bold" => $row->bold] : false;
        };

        // Match succeeded: swap the stem suffix for the inflected suffix.
        $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
        $output[$word] = $lookup($word);

        // Try sandhi.
        // TODO: add two sandhi
        $case = new CaseEnding();
        $wordLen = mb_strlen($word, "UTF-8");
        foreach ($case->union as $sandhi) {
            // Character count, not byte count: the value feeds mb_substr().
            $sandhiLen = mb_strlen($sandhi[0], "UTF-8");
            if (mb_substr($word, 0 - $sandhiLen, null, "UTF-8") !== $sandhi[0]) {
                continue;
            }
            $sandhiWord = mb_substr($word, 0, $wordLen - $sandhiLen, "UTF-8") . $sandhi[1];
            $output[$sandhiWord] = $lookup($sandhiWord);
        }

        return $output;
    }
    /**
     * List every indexed word form that can be derived from a stem.
     *
     * All rules of CaseEnding's `ending` table with sufficient confidence are
     * applied to $base, regardless of word type. Each resulting form and each
     * sandhi variant built from the `union` table is looked up in WordIndex
     * once; forms that are not indexed are left out of the result.
     *
     * @param string $base       stem to inflect
     * @param float  $confidence rules with a lower confidence are skipped
     * @return array list of ['word' => ..., 'ending' => ..., 'count' => ...,
     *               'bold' => ...]; 'ending' is the inflected suffix of the
     *               first rule that produced the word
     */
    public function BaseToWord($base, $confidence = 0.5)
    {
        // word => ['count', 'bold', 'ending'], or false when not indexed.
        // Doubles as a cache so that each spelling is queried only once.
        $newWord = array();
        $case = new CaseEnding();
        $baseLen = mb_strlen($base, "UTF-8");

        foreach ($case->ending as $ending) {
            if ($ending[4] < $confidence) {
                continue;
            }

            $endingLen = mb_strlen($ending[0], "UTF-8");
            if (mb_substr($base, 0 - $endingLen, null, "UTF-8") !== $ending[0]) {
                continue;
            }

            // Match succeeded: swap the stem suffix for the inflected suffix.
            $word = mb_substr($base, 0, $baseLen - $endingLen, "UTF-8") . $ending[1];
            if (!isset($newWord[$word])) {
                $row = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
                $newWord[$word] = $row
                    ? ["count" => $row->count, "bold" => $row->bold, "ending" => $ending[1]]
                    : false;
            }

            // Try sandhi.
            // TODO: add two sandhi
            $wordLen = mb_strlen($word, "UTF-8");
            foreach ($case->union as $sandhi) {
                $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
                if (mb_substr($word, 0 - $sandhiLen, null, "UTF-8") !== $sandhi[0]) {
                    continue;
                }
                $sandhiWord = mb_substr($word, 0, $wordLen - $sandhiLen, "UTF-8") . $sandhi[1];
                if (isset($newWord[$sandhiWord])) {
                    continue;
                }
                $row = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
                $newWord[$sandhiWord] = $row
                    ? ["count" => $row->count, "bold" => $row->bold, "ending" => $ending[1]]
                    : false;
            }
        }

        $result = [];
        foreach ($newWord as $key => $value) {
            if ($value === false) {
                continue;
            }
            // 'ending' used to be a bare string literal without a key here,
            // which produced a meaningless element 0 => 'ending' in every row.
            $result[] = [
                'word' => $key,
                'ending' => $value["ending"],
                "count" => $value["count"],
                "bold" => $value["bold"]
            ];
        }
        return $result;
    }

    /**
     * Derive candidate stems from an inflected word
     * ("the little tadpoles look for their mother").
     *
     * The rules of CaseEnding's `ending` table are applied in reverse: when the
     * word ends in a rule's inflected suffix [1], that suffix is replaced by
     * the stem suffix [0]. With $deep > 1 the stems found in one round are fed
     * back in as words for the next round.
     *
     * @param  string  $word       input word
     * @param  int     $deep       search depth (number of rounds)
     * @param  boolean $verify     whether to keep only stems confirmed by
     *                             VerifyBase(); when none is confirmed the
     *                             shortest candidate stem is returned instead
     * @param  float   $confidence rules with a lower confidence are skipped
     * @return array map stem => list of rows with the keys word, type,
     *               grammar, parent, factors and confidence
     */
    public function WordToBase($word, $deep = 1, $verify = true, $confidence = 0.5)
    {
        $newBase = array();
        // word => true while the word still has to be expanded.
        $input = array($word => true);
        $case = new CaseEnding();

        for ($i = 0; $i < $deep; $i++) {
            foreach ($input as $currWord => $status) {
                if (!$status) {
                    continue;
                }
                $input[$currWord] = false;
                $currLen = mb_strlen($currWord, "UTF-8");

                foreach ($case->ending as $ending) {
                    if ($ending[4] < $confidence) {
                        continue;
                    }
                    $endingLen = mb_strlen($ending[1], "UTF-8");
                    if (mb_substr($currWord, 0 - $endingLen, null, "UTF-8") !== $ending[1]) {
                        continue;
                    }

                    // Match succeeded: swap the inflected suffix for the stem suffix.
                    $base = mb_substr($currWord, 0, $currLen - $endingLen, "UTF-8") . $ending[0];
                    if (!isset($newBase[$base])) {
                        $newBase[$base] = array();
                    }
                    $info = [
                        'word' => $currWord,
                        'type' => $ending[2],
                        'grammar' => $ending[3],
                        'parent' => $base,
                        'factors' => "{$base}+[{$ending[1]}]",
                        'confidence' => $ending[4],
                    ];
                    $newBase[$base][] = $info;
                    // A '.n.' rule is also recorded as '.ti.' and '.adj.',
                    // a '.ti.' rule also as '.adj.'.
                    if ($ending[2] === '.n.') {
                        $info['type'] = '.ti.';
                        $newBase[$base][] = $info;
                        $info['type'] = '.adj.';
                        $newBase[$base][] = $info;
                    }
                    if ($ending[2] === '.ti.') {
                        $info['type'] = '.adj.';
                        $newBase[$base][] = $info;
                    }
                }
            }

            // Add the newly found stems to the list for the next round.
            foreach ($newBase as $currWord => $value) {
                if (!isset($input[$currWord])) {
                    $input[$currWord] = true;
                }
            }
        }

        if (!$verify) {
            return $newBase;
        }

        $output = array();
        foreach ($newBase as $base => $rows) {
            $verified = $this->VerifyBase($base, $rows);
            if ($verified !== false) {
                $output[$base] = $verified;
            }
        }

        if (count($output) === 0) {
            // Verification failed for every stem: output the most likely
            // result, i.e. the first stem of minimal length.
            $shortestBase = null;
            $shortestLen = PHP_INT_MAX;
            foreach ($newBase as $base => $rows) {
                $len = mb_strlen($base, "UTF-8");
                if ($len < $shortestLen) {
                    $shortestLen = $len;
                    $shortestBase = $base;
                }
            }
            if ($shortestBase !== null) {
                $output[$shortestBase] = $newBase[$shortestBase];
            }
        }

        return $output;
    }
    /**
     * Check a candidate stem against the dictionary.
     *
     * Looks up the distinct (type, grammar) pairs stored in UserDict for the
     * spelling $base and keeps those of the guessed $rows whose derived parent
     * type is among them.
     *
     * @param string $base candidate stem
     * @param array  $rows guessed rows, each with at least 'type' and 'grammar'
     * @return array|false the plausible rows (possibly an empty list), or false
     *                     when the dictionary has no entry spelled $base
     */
    public function VerifyBase($base, $rows)
    {
        $dictWords = UserDict::where('word', $base)
            ->select(['type', 'grammar'])
            ->groupBy(['type', 'grammar'])
            ->get();
        if (count($dictWords) === 0) {
            return false;
        }

        // Grammar information of the dictionary words with this spelling.
        // NOTE(review): for '.n.' entries the key is '.n.' plus the first
        // grammar segment, whereas the rows below are matched against keys
        // starting with '.n:base.'; '.adj.' rows map to ''. Confirm against the
        // dictionary data that this is intended.
        $known = array();
        foreach ($dictWords as $entry) {
            $key = $entry->type;
            if ($entry->type === '.n.') {
                $segments = explode('$', $entry->grammar);
                $key = $entry->type . $segments[0];
            }
            $known[$key] = 1;
        }

        $output = array();
        foreach ($rows as $row) {
            // Build the plausible parent grammar information from the guessed
            // type and grammar of the row.
            switch ($row['type']) {
                case '.n.':
                    $parentType = '.n:base.';
                    break;
                case '.ti.':
                    $parentType = '.ti:base.';
                    break;
                case '.v.':
                    $parentType = '.v:base.';
                    break;
                default:
                    $parentType = '';
                    break;
            }
            if (!empty($row['grammar']) && $row['type'] === ".n.") {
                $segments = explode('$', $row['grammar']);
                $parentType .= $segments[0];
            }
            // Keep only the rows whose grammar information is plausible.
            if (isset($known[$parentType])) {
                $output[] = $row;
            }
        }

        return $output;
    }
}