﻿//  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
//
//  Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
//  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;

namespace NMeCab.Core
{
    public class Tokenizer : IDisposable
    {
        #region Const

        // File name of the system dictionary, resolved against the dictionary directory in Open.
        private const string SysDicFile = "sys.dic";
        // File name of the unknown-word dictionary, resolved against the dictionary directory in Open.
        private const string UnkDicFile = "unk.dic";
        // Capacity of the result buffer handed to CommonPrefixSearch in Lookup.
        private const int DAResultSize = 512;
        // Fallback for maxGroupingSize when the configured value is zero or negative.
        private const int DefaltMaxGroupingSize = 24;
        // Surface string given to the BOS/EOS nodes.
        private const string BosKey = "BOS/EOS";

        #endregion

        #region Field

        // Dictionaries searched by Lookup: index 0 is the system dictionary,
        // the following entries are the user dictionaries in configuration order.
        private MeCabDictionary[] dic;
        // Unknown-word dictionary; supplies the tokens stored in unkTokens.
        private readonly MeCabDictionary unkDic = new MeCabDictionary();
        // Feature string assigned to the BOS/EOS nodes (taken from the parameters in Open).
        private string bosFeature;
        // When not null, overrides the feature of every unknown-word node.
        private string unkFeature;
        // Unknown-word tokens, indexed by the character category index of the char property table.
        private Token[][] unkTokens;
        // Character info of ' '; Lookup skips leading characters of this kind.
        private CharInfo space;
        // Character property table opened from the dictionary directory.
        private readonly CharProperty property = new CharProperty();
        // Upper bound on the run length (as reported by SeekToOtherType) for a grouped unknown word.
        private int maxGroupingSize;

        #endregion

        #region Open/Clear

        /// <summary>
        /// Loads the character property table, the unknown-word dictionary, the system
        /// dictionary and every user dictionary named by <paramref name="param"/>, then
        /// caches the unknown-word tokens of each character category.
        /// </summary>
        /// <param name="param">
        /// Settings providing the dictionary directory, the user dictionary paths
        /// (combined with the dictionary directory via <see cref="Path.Combine(string, string)"/>),
        /// the BOS and unknown-word features and the maximum grouping size.
        /// A null <c>UserDic</c> is treated as "no user dictionaries".
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="param"/> is null.</exception>
        /// <exception cref="MeCabInvalidFileException">
        /// A dictionary has the wrong type, a user dictionary is incompatible with the
        /// system dictionary, or a character category has no entry in the unknown-word dictionary.
        /// </exception>
        public void Open(MeCabParam param)
        {
            if (param == null)
                throw new ArgumentNullException("param");

            string[] userDics = param.UserDic ?? new string[0];
            this.dic = new MeCabDictionary[userDics.Length + 1];

            string prefix = param.DicDir;

            this.property.Open(prefix);

            this.unkDic.Open(Path.Combine(prefix, UnkDicFile));
            if (this.unkDic.Type != DictionaryType.Unk)
                throw new MeCabInvalidFileException("not a unk dictionary", this.unkDic.FileName);

            // Every dictionary is stored in this.dic BEFORE it is opened and validated,
            // so that Dispose() still releases it when one of the checks below throws.
            MeCabDictionary sysDic = new MeCabDictionary();
            this.dic[0] = sysDic;
            sysDic.Open(Path.Combine(prefix, SysDicFile));
            if (sysDic.Type != DictionaryType.Sys)
                throw new MeCabInvalidFileException("not a system dictionary", sysDic.FileName);

            for (int i = 0; i < userDics.Length; i++)
            {
                MeCabDictionary d = new MeCabDictionary();
                this.dic[i + 1] = d;
                d.Open(Path.Combine(prefix, userDics[i]));
                if (d.Type != DictionaryType.Usr)
                    throw new MeCabInvalidFileException("not a user dictionary", d.FileName);
                if (!sysDic.IsCompatible(d))
                    throw new MeCabInvalidFileException("incompatible dictionary", d.FileName);
            }

            // One token array per character category, looked up by category name.
            this.unkTokens = new Token[this.property.Size][];
            for (int i = 0; i < this.unkTokens.Length; i++)
            {
                string key = this.property.Name(i);
                DoubleArray.ResultPair n = this.unkDic.ExactMatchSearch(key);
                if (n.Value == -1)
                    throw new MeCabInvalidFileException("cannot find UNK category: " + key, this.unkDic.FileName);
                this.unkTokens[i] = this.unkDic.GetToken(n);
            }

            this.space = this.property.GetCharInfo(' ');

            this.bosFeature = param.BosFeature;
            this.unkFeature = param.UnkFeature;

            // Non-positive values mean "not configured": fall back to the built-in default.
            this.maxGroupingSize = param.MaxGroupingSize;
            if (this.maxGroupingSize <= 0)
                this.maxGroupingSize = DefaltMaxGroupingSize;
        }

        #endregion

        #region Lookup

        /// <summary>
        /// Collects every candidate node that starts at the beginning of
        /// <paramref name="str"/>, after skipping leading characters of the space category.
        /// </summary>
        /// <param name="str">Text to analyse, starting at the position to look up.</param>
        /// <returns>
        /// The head of the candidate list (linked through <c>BNext</c>). Dictionary matches
        /// come first; unknown-word candidates are added when nothing matched or when the
        /// character category requests it (<c>Invoke</c>). May be null when the category
        /// has no unknown-word token registered.
        /// </returns>
        public MeCabNode Lookup(string str)
        {
            CharInfo cInfo = new CharInfo();
            MeCabNode resultNode = null;
            int cLen = 0;

            // Index in str of the first character that is not of the space category.
            var beginIndex2 = property.SeekToOtherType(str, this.space, ref cInfo, ref cLen);

            var daResults = new DoubleArray.ResultPair[DAResultSize];

            // The search key is the same for every dictionary, so build it only once.
            string key = str.Substring(beginIndex2);

            foreach (MeCabDictionary it in this.dic)
            {
                int n = it.CommonPrefixSearch(key, daResults, DAResultSize);

                for (int i = 0; i < n; i++)
                {
                    // Match length is counted from beginIndex2 (the key starts there).
                    int matchLength = daResults[i].Length;
                    Token[] token = it.GetToken(daResults[i]);
                    for (int j = 0; j < token.Length; j++)
                    {
                        MeCabNode newNode = this.GetNewNode();
                        this.ReadNodeInfo(it, token[j], newNode);
                        newNode.Length = matchLength;
                        // RLength additionally covers the skipped leading spaces.
                        newNode.RLength = beginIndex2 + matchLength;
                        // The surface is exactly the matched characters: the second
                        // argument of Substring is a length, not an end index.
                        newNode.Surface = str.Substring(beginIndex2, matchLength);
                        newNode.Stat = MeCabNodeStat.Nor;
                        newNode.CharType = cInfo.DefaultType;
                        newNode.BNext = resultNode;
                        resultNode = newNode;
                    }
                }
            }

            if (resultNode != null && !cInfo.Invoke)
                return resultNode;

            // End index (exclusive, relative to str) of the shortest unknown-word candidate.
            // Clamped so that an input holding nothing but skipped spaces yields an
            // empty-surface candidate instead of an out-of-range Substring call.
            var beginIndex3 = Math.Min(beginIndex2 + 1, str.Length);
            int? groupBeginIndex3 = null;

            if (cInfo.Group)
            {
                var tmp = beginIndex3;
                CharInfo fail = new CharInfo();
                if (tmp < str.Length)
                {
                    // SeekToOtherType reports a position inside the string it was given,
                    // so shift it back by tmp to obtain an index into str.
                    beginIndex3 = tmp + this.property.SeekToOtherType(str.Substring(tmp), cInfo, ref fail, ref cLen);
                }
                else
                {
                    // Nothing follows the first character: the run cannot be extended.
                    cLen = 0;
                }
                if (cLen <= maxGroupingSize)
                    this.AddUnknown(ref resultNode, cInfo, str.Substring(0, beginIndex3), beginIndex2);
                groupBeginIndex3 = beginIndex3;
                beginIndex3 = tmp;
            }

            // Add candidates of 1 .. cInfo.Length characters while they stay in the same category.
            for (int i = 1; i <= cInfo.Length; i++)
            {
                // This length was already emitted by the grouping step; since nothing
                // would change in later iterations either, stop here.
                if (beginIndex3 == groupBeginIndex3)
                    break;
                this.AddUnknown(ref resultNode, cInfo, str.Substring(0, beginIndex3), beginIndex2);
                // End of input: there is no further character to test or to extend with.
                if (beginIndex3 >= str.Length)
                    break;
                if (!cInfo.IsKindOf(this.property.GetCharInfo(str[beginIndex3])))
                    break;
                beginIndex3 += 1;
            }

            // Last resort so that the caller gets at least one candidate when possible.
            if (resultNode == null)
                this.AddUnknown(ref resultNode, cInfo, str.Substring(0, beginIndex3), beginIndex2);

            return resultNode;
        }

        /// <summary>
        /// Copies the word cost, part-of-speech id and left/right context attributes of
        /// <paramref name="token"/> onto <paramref name="node"/> and hands the node the
        /// token's feature reference together with its dictionary.
        /// </summary>
        private void ReadNodeInfo(MeCabDictionary dic, Token token, MeCabNode node)
        {
            node.WCost = token.WCost;
            node.PosId = token.PosId;
            node.RCAttr = token.RcAttr;
            node.LCAttr = token.LcAttr;

            // The feature string itself is not fetched at this stage; passing the
            // feature reference and the dictionary allows it to be retrieved lazily.
            node.SetFeature(token.Feature, dic);
        }

        /// <summary>
        /// Prepends one unknown-word node per token registered for the default
        /// category of <paramref name="cInfo"/> to the candidate list.
        /// </summary>
        /// <param name="resultNode">Head of the candidate list; updated to the newest node.</param>
        /// <param name="cInfo">Character info whose default type selects the unknown-word tokens.</param>
        /// <param name="str">Input truncated at the end of the unknown word.</param>
        /// <param name="beginIndex2">Index in <paramref name="str"/> where the unknown word starts.</param>
        private void AddUnknown(ref MeCabNode resultNode, CharInfo cInfo, string str, int beginIndex2)
        {
            Token[] candidates = this.unkTokens[cInfo.DefaultType];
            foreach (Token unk in candidates)
            {
                MeCabNode node = this.GetNewNode();
                this.ReadNodeInfo(this.unkDic, unk, node);

                node.Stat = MeCabNodeStat.Unk;
                node.CharType = cInfo.DefaultType;
                // RLength spans everything before the word as well; Length only the word.
                node.RLength = str.Length;
                node.Length = str.Length - beginIndex2;
                node.Surface = str.Substring(beginIndex2);

                // A configured unknown feature replaces the one set by ReadNodeInfo.
                if (this.unkFeature != null)
                    node.Feature = this.unkFeature;

                node.BNext = resultNode;
                resultNode = node;
            }
        }

        #endregion

        #region Get Node

        /// <summary>
        /// Creates the beginning-of-sentence node: marked as best, carrying the
        /// configured BOS feature and the placeholder surface "BOS/EOS".
        /// </summary>
        /// <returns>A new node with <c>Stat</c> set to <c>MeCabNodeStat.Bos</c>.</returns>
        public MeCabNode GetBosNode()
        {
            MeCabNode node = this.GetNewNode();
            node.Stat = MeCabNodeStat.Bos;
            node.IsBest = true;
            node.Feature = this.bosFeature;
            node.Surface = BosKey; // placeholder text only
            return node;
        }

        /// <summary>
        /// Creates the end-of-sentence node. It is built exactly like the BOS node
        /// and then re-labelled as EOS.
        /// </summary>
        /// <returns>A new node with <c>Stat</c> set to <c>MeCabNodeStat.Eos</c>.</returns>
        public MeCabNode GetEosNode()
        {
            MeCabNode node = this.GetBosNode();
            node.Stat = MeCabNodeStat.Eos;
            return node;
        }

        /// <summary>
        /// Allocates a blank node.
        /// </summary>
        /// <returns>A freshly constructed <c>MeCabNode</c>.</returns>
        public MeCabNode GetNewNode()
        {
            return new MeCabNode();
        }

        #endregion

        #region Dispose

        // Set once Dispose(bool) has completed; later calls become no-ops.
        private bool disposed;

        /// <summary>
        /// Releases the dictionaries held by this tokenizer and tells the GC that
        /// the finalizer no longer needs to run.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Disposes the loaded dictionaries (only when <paramref name="disposing"/> is true)
        /// and marks this instance as disposed.
        /// </summary>
        /// <param name="disposing">
        /// True when called from <see cref="Dispose()"/>; false when called from the finalizer.
        /// </param>
        protected virtual void Dispose(bool disposing)
        {
            if (!this.disposed)
            {
                if (disposing)
                {
                    MeCabDictionary[] opened = this.dic;
                    if (opened != null)
                    {
                        // Slots can still be null when Open did not run to completion.
                        for (int i = 0; i < opened.Length; i++)
                        {
                            if (opened[i] != null)
                                opened[i].Dispose();
                        }
                    }

                    if (this.unkDic != null)
                        this.unkDic.Dispose();
                }

                this.disposed = true;
            }
        }

        /// <summary>
        /// Finalizer: runs the non-disposing cleanup path when Dispose() was never called.
        /// </summary>
        ~Tokenizer()
        {
            Dispose(false);
        }

        #endregion
    }
}
