add subtitle language detection

This commit is contained in:
Luke Pulverenti 2017-06-17 18:59:17 -04:00
parent c9d7eb9b04
commit 0e7cbb0465
76 changed files with 2256 additions and 26 deletions

View file

@ -317,6 +317,23 @@
<Compile Include="ScheduledTasks\WeeklyTrigger.cs" />
<Compile Include="Serialization\JsonSerializer.cs" />
<Compile Include="Serialization\XmlSerializer.cs" />
<Compile Include="TextEncoding\NLangDetect\Detector.cs" />
<Compile Include="TextEncoding\NLangDetect\DetectorFactory.cs" />
<Compile Include="TextEncoding\NLangDetect\ErrorCode.cs" />
<Compile Include="TextEncoding\NLangDetect\Extensions\CharExtensions.cs" />
<Compile Include="TextEncoding\NLangDetect\Extensions\RandomExtensions.cs" />
<Compile Include="TextEncoding\NLangDetect\Extensions\StringExtensions.cs" />
<Compile Include="TextEncoding\NLangDetect\Extensions\UnicodeBlock.cs" />
<Compile Include="TextEncoding\NLangDetect\GenProfile.cs" />
<Compile Include="TextEncoding\NLangDetect\InternalException.cs" />
<Compile Include="TextEncoding\NLangDetect\Language.cs" />
<Compile Include="TextEncoding\NLangDetect\LanguageDetector.cs" />
<Compile Include="TextEncoding\NLangDetect\NLangDetectException.cs" />
<Compile Include="TextEncoding\NLangDetect\ProbVector.cs" />
<Compile Include="TextEncoding\NLangDetect\Utils\LangProfile.cs" />
<Compile Include="TextEncoding\NLangDetect\Utils\Messages.cs" />
<Compile Include="TextEncoding\NLangDetect\Utils\NGram.cs" />
<Compile Include="TextEncoding\NLangDetect\Utils\TagExtractor.cs" />
<Compile Include="TextEncoding\TextEncoding.cs" />
<Compile Include="TextEncoding\TextEncodingDetect.cs" />
<Compile Include="TextEncoding\UniversalDetector\CharsetDetector.cs" />
@ -368,7 +385,62 @@
</ItemGroup>
<ItemGroup>
  <None Include="packages.config" />
  <!--
    Language profiles are read at runtime through Assembly.GetManifestResourceNames /
    GetManifestResourceStream (DetectorFactory.LoadProfiles), so every profile has to be
    an EmbeddedResource. Items declared as None are not compiled into the assembly and
    would silently be missing from the detector.
  -->
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\afr" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ara" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\bul" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ben" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ces" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\dan" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\deu" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ell" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\eng" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\spa" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\est" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\fas" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\fin" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\fra" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\guj" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\heb" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\hin" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\hrv" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\hun" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ind" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ita" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\jpn" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\kan" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\kor" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\lit" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\lav" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\mkd" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\mal" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\mar" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\nep" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\nld" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\nor" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\pan" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\pol" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\por" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ron" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\rus" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\slk" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\slv" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\som" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\sqi" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\swe" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\swa" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tam" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tel" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tha" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tgl" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tur" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ukr" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\urd" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\vie" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\zh-cn" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\zh-tw" />
  <EmbeddedResource Include="TextEncoding\NLangDetect\Utils\messages.properties" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.

View file

@ -207,24 +207,6 @@ namespace SharpCifs.Util.Sharpen
return (int)tzone.GetUtcOffset(MillisToDateTimeOffset(date, 0).DateTime).TotalMilliseconds;
}
/// <summary>
/// Opens a manifest resource of the assembly that declares <paramref name="type"/>.
/// The resource is looked up as "&lt;assembly name&gt;.resources.&lt;type namespace&gt;.&lt;name&gt;".
/// </summary>
/// <param name="type">Type whose assembly and namespace locate the resource.</param>
/// <param name="name">Trailing part of the resource name.</param>
/// <returns>The wrapped resource stream, or null when no such resource exists.</returns>
public static InputStream GetResourceAsStream(this Type type, string name)
{
    // Type.Assembly is not available here, so the assembly is reached through TypeInfo.
    var owner = type.GetTypeInfo().Assembly;
    string resourceName = owner.GetName().Name + ".resources" + "." + type.Namespace + "." + name;

    Stream raw = owner.GetManifestResourceStream(resourceName);
    return raw == null ? null : InputStream.Wrap(raw);
}
public static long GetTime(this DateTime dateTime)
{
return new DateTimeOffset(DateTime.SpecifyKind(dateTime, DateTimeKind.Utc), TimeSpan.Zero).ToMillisecondsSinceEpoch();

View file

@ -0,0 +1,371 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using NLangDetect.Core.Extensions;
using NLangDetect.Core.Utils;
namespace NLangDetect.Core
{
public class Detector
{
// Initial value of _alpha (see the _alpha field initialiser below).
private const double _AlphaDefault = 0.5;
// Scale applied to the Gaussian noise that DetectBlock adds to _alpha for each trial.
private const double _AlphaWidth = 0.05;
// A trial in DetectBlock stops once its iteration counter reaches this value
// (the check only runs on every fifth iteration).
private const int _IterationLimit = 1000;
// SortProbability only keeps languages whose probability is greater than this value.
private const double _ProbThreshold = 0.1;
// A trial in DetectBlock stops once the largest normalised probability exceeds this value.
private const double _ConvThreshold = 0.99999;
// UpdateLangProb divides alpha by this value to obtain its additive weight.
private const int _BaseFreq = 10000;
// Matches http/https URLs; Append(string) replaces every match with a single space.
private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
// Matches e-mail-like tokens; Append(string) replaces every match with a single space.
private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);
// n-gram -> per-language probability vector; taken from the factory in the constructor.
private readonly Dictionary<string, ProbVector> _wordLangProbMap;
// Language names; position i corresponds to index i of every probability array in this class.
private readonly List<string> _langlist;
// Normalised text accumulated by the Append overloads (may be replaced by CleanText).
private StringBuilder _text;
// Averaged per-language probabilities; null until DetectBlock has run.
private double[] _langprob;
// Base alpha used by DetectBlock; changed through SetAlpha.
private double _alpha = _AlphaDefault;
// Number of randomised trials DetectBlock averages over.
private const int _trialsCount = 7;
// Limit used by the Append overloads; changed through SetMaxTextLength.
private int _maxTextLength = 10000;
// Normalised prior per language (same indexing as _langlist); null means a uniform prior is used.
private double[] _priorMap;
// Optional seed for the Random created in DetectBlock; null means an unseeded Random.
private int? _seed;
#region Constructor(s)
/// <summary>
/// Creates a detector with an empty text buffer that uses the factory's language list,
/// n-gram probability map and seed. The list and map are stored by reference, not copied.
/// </summary>
/// <param name="factory">Factory holding the loaded language profiles.</param>
public Detector(DetectorFactory factory)
{
    _text = new StringBuilder();
    _seed = factory.Seed;
    _langlist = factory.Langlist;
    _wordLangProbMap = factory.WordLangProbMap;
}
#endregion
#region Public methods
/// <summary>
/// Sets the base alpha value. DetectBlock adds Gaussian noise (scaled by _AlphaWidth) to it
/// for every trial and hands the result to UpdateLangProb.
/// </summary>
/// <param name="alpha">New base alpha; stored without validation.</param>
public void SetAlpha(double alpha)
{
_alpha = alpha;
}
/// <summary>
/// Installs prior probabilities for the known languages. Languages missing from
/// <paramref name="priorMap"/> get a prior of zero; the stored priors are scaled so they sum to one.
/// </summary>
/// <param name="priorMap">Language name to non-negative prior weight.</param>
/// <exception cref="NLangDetectException">
/// A weight is negative, or the weights of the known languages do not add up to a positive value
/// (both reported with <see cref="ErrorCode.InitParamError"/>).
/// </exception>
public void SetPriorMap(Dictionary<string, double> priorMap)
{
    _priorMap = new double[_langlist.Count];
    double total = 0;

    for (int slot = 0; slot < _priorMap.Length; slot++)
    {
        double prior;
        if (!priorMap.TryGetValue(_langlist[slot], out prior))
        {
            // Language not mentioned by the caller: its slot keeps the default of zero.
            continue;
        }

        if (prior < 0)
        {
            throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
        }

        _priorMap[slot] = prior;
        total += prior;
    }

    if (total <= 0)
    {
        throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
    }

    // Scale so the priors form a probability distribution.
    for (int slot = 0; slot < _priorMap.Length; slot++)
    {
        _priorMap[slot] /= total;
    }
}
/// <summary>
/// Sets the text length limit. Append(StreamReader) stops reading once the buffered text
/// reaches it (and sizes its read buffer to half of it); Append(string) looks at no more
/// than this many characters per call.
/// </summary>
/// <param name="max_text_length">New limit; stored without validation.</param>
public void SetMaxTextLength(int max_text_length)
{
_maxTextLength = max_text_length;
}
// TODO IMM HI: TextReader?
/// <summary>
/// Reads text from <paramref name="streamReader"/> and appends it (via Append(string))
/// until the buffered text reaches the maximum text length or the stream ends.
/// </summary>
/// <param name="streamReader">Source of the text to analyse.</param>
public void Append(StreamReader streamReader)
{
    // The buffer must hold at least one char: with a zero-length buffer Read() returns 0
    // without consuming anything, EndOfStream never becomes true and the loop never ends
    // (this happened for a maximum text length of 1).
    var buf = new char[Math.Max(1, _maxTextLength / 2)];

    while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
    {
        int length = streamReader.Read(buf, 0, buf.Length);
        if (length <= 0)
        {
            // Nothing was read; stop instead of spinning.
            break;
        }

        Append(new string(buf, 0, length));
    }
}
/// <summary>
/// Appends text to the detection buffer. URLs and e-mail-like tokens are replaced by a space,
/// every character is passed through NGram.Normalize, and a space that directly follows
/// another space is dropped. At most the first "maximum text length" characters of the
/// scrubbed input are considered.
/// </summary>
/// <param name="text">Text to append.</param>
public void Append(string text)
{
    string scrubbed = _MailRegex.Replace(_UrlRegex.Replace(text, " "), " ");
    int limit = Math.Min(scrubbed.Length, _maxTextLength);

    char previous = '\0';
    for (int pos = 0; pos < limit; pos++)
    {
        char current = NGram.Normalize(scrubbed[pos]);

        bool repeatedSpace = current == ' ' && previous == ' ';
        if (!repeatedSpace)
        {
            _text.Append(current);
        }

        previous = current;
    }
}
/// <summary>
/// Removes the characters in the range 'A'..'z' from the buffered text when they are clearly
/// outnumbered by non-Latin characters (twice their count is still below the non-Latin count).
/// Characters from U+0300 upwards count as non-Latin unless they belong to the
/// Latin Extended Additional block.
/// </summary>
private void CleanText()
{
    string snapshot = _text.ToString();

    int latin = 0;
    int other = 0;
    foreach (char ch in snapshot)
    {
        if (ch >= 'A' && ch <= 'z')
        {
            latin++;
            continue;
        }

        if (ch >= '\u0300' && ch.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
        {
            other++;
        }
    }

    if (latin * 2 >= other)
    {
        // Latin text is not a minority: leave the buffer untouched.
        return;
    }

    var stripped = new StringBuilder();
    foreach (char ch in snapshot)
    {
        if (ch < 'A' || ch > 'z')
        {
            stripped.Append(ch);
        }
    }

    _text = stripped;
}
/// <summary>
/// Returns the name of the most probable language for the appended text.
/// </summary>
/// <returns>
/// The first entry of <see cref="GetProbabilities"/>, or null when that list is empty.
/// </returns>
public string Detect()
{
    List<Language> ranked = GetProbabilities();
    if (ranked.Count == 0)
    {
        return null;
    }

    return ranked[0].Name;
}
/// <summary>
/// Returns the candidate languages ordered by descending probability. The probabilities are
/// computed on the first call and reused afterwards.
/// </summary>
/// <returns>Languages that pass the probability threshold of SortProbability.</returns>
public List<Language> GetProbabilities()
{
    bool alreadyComputed = _langprob != null;
    if (!alreadyComputed)
    {
        DetectBlock();
    }

    return SortProbability(_langprob);
}
#endregion
#region Private helper methods
/// <summary>
/// Divides every entry of <paramref name="probs"/> by the sum of all entries (in place).
/// </summary>
/// <param name="probs">Values to normalise; modified by this call.</param>
/// <returns>The largest normalised value, or 0 when no value is greater than 0.</returns>
private static double NormalizeProb(double[] probs)
{
    double total = 0;
    total += probs.Sum();

    double largest = 0;
    int slot = 0;
    while (slot < probs.Length)
    {
        double scaled = probs[slot] / total;
        probs[slot] = scaled;
        if (scaled > largest)
        {
            largest = scaled;
        }
        slot++;
    }

    return largest;
}
/// <summary>
/// Escapes every character at or above U+0080 as a backslash, 'u' and four lowercase
/// hexadecimal digits; characters below U+0080 are copied unchanged.
/// </summary>
/// <param name="word">Text to encode.</param>
/// <returns>The encoded text.</returns>
private static string UnicodeEncode(string word)
{
    var encoded = new StringBuilder();

    for (int pos = 0; pos < word.Length; pos++)
    {
        char ch = word[pos];
        if (ch < '\u0080')
        {
            encoded.Append(ch);
            continue;
        }

        // Adding 0x10000 yields a five-digit hex string whose last four digits are the
        // zero-padded code unit; the leading digit is cut off below.
        string hex = string.Format("{0:x}", 0x10000 + ch).PadLeft(4, '0');
        encoded.Append("\\u");
        encoded.Append(hex.SubSequence(1, 5));
    }

    return encoded.ToString();
}
/// <summary>
/// Computes the per-language probabilities for the buffered text and stores them in _langprob.
/// Runs _trialsCount randomised trials; each trial repeatedly picks a random known n-gram,
/// updates the trial probabilities with it and stops on convergence or at the iteration limit.
/// The trial results are averaged.
/// </summary>
/// <exception cref="NLangDetectException">
/// The text contains no known n-gram (<see cref="ErrorCode.CantDetectError"/>).
/// </exception>
private void DetectBlock()
{
    CleanText();

    List<string> features = ExtractNGrams();
    if (features.Count == 0)
    {
        throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
    }

    _langprob = new double[_langlist.Count];

    Random rng;
    if (_seed.HasValue)
    {
        rng = new Random(_seed.Value);
    }
    else
    {
        rng = new Random();
    }

    int trial = 0;
    while (trial < _trialsCount)
    {
        double[] trialProb = InitProbability();
        // TODO IMM HI: verify it works
        double trialAlpha = _alpha + rng.NextGaussian() * _AlphaWidth;

        int iteration = 0;
        while (true)
        {
            string feature = features[rng.Next(features.Count)];
            UpdateLangProb(trialProb, feature, trialAlpha);

            // Normalisation and the stop checks only happen on every fifth iteration.
            if (iteration % 5 == 0
                && (NormalizeProb(trialProb) > _ConvThreshold || iteration >= _IterationLimit))
            {
                break;
            }

            iteration++;
        }

        for (int lang = 0; lang < _langprob.Length; lang++)
        {
            _langprob[lang] += trialProb[lang] / _trialsCount;
        }

        trial++;
    }
}
/// <summary>
/// Creates the starting probabilities for one trial: a copy of the configured priors, or a
/// uniform distribution over all languages when no priors were set.
/// </summary>
/// <returns>A new array with one entry per language.</returns>
private double[] InitProbability()
{
    var start = new double[_langlist.Count];
    bool hasPriors = _priorMap != null;

    int slot = 0;
    while (slot < start.Length)
    {
        start[slot] = hasPriors ? _priorMap[slot] : 1.0 / _langlist.Count;
        slot++;
    }

    return start;
}
/// <summary>
/// Feeds the buffered text character by character into an NGram and collects, after each
/// character, every n-gram of size 1..NGram.GramsCount that is present in the probability map.
/// </summary>
/// <returns>The known n-grams in order of occurrence (duplicates included).</returns>
private List<string> ExtractNGrams()
{
    var known = new List<string>();
    var window = new NGram();

    int pos = 0;
    while (pos < _text.Length)
    {
        window.AddChar(_text[pos]);

        for (int size = 1; size <= NGram.GramsCount; size++)
        {
            string gram = window.Get(size);
            if (gram == null)
            {
                continue;
            }

            if (_wordLangProbMap.ContainsKey(gram))
            {
                known.Add(gram);
            }
        }

        pos++;
    }

    return known;
}
/// <summary>
/// Multiplies every entry of <paramref name="prob"/> by (alpha / _BaseFreq + probability of
/// <paramref name="word"/> for that language). Does nothing for a null or unknown word.
/// </summary>
/// <param name="prob">Per-language probabilities; modified in place.</param>
/// <param name="word">N-gram to apply.</param>
/// <param name="alpha">Alpha value of the current trial.</param>
private void UpdateLangProb(double[] prob, string word, double alpha)
{
    ProbVector perLanguage;
    if (word == null || !_wordLangProbMap.TryGetValue(word, out perLanguage))
    {
        return;
    }

    double smoothing = alpha / _BaseFreq;
    for (int lang = 0; lang < prob.Length; lang++)
    {
        prob[lang] = prob[lang] * (smoothing + perLanguage[lang]);
    }
}
/// <summary>
/// Builds the result list from raw per-language probabilities: languages whose probability
/// is greater than _ProbThreshold, ordered by descending probability.
/// </summary>
/// <param name="prob">Probabilities indexed like _langlist.</param>
/// <returns>The filtered, ordered list (possibly empty).</returns>
private List<Language> SortProbability(double[] prob)
{
    var ranked = new List<Language>();

    for (int lang = 0; lang < prob.Length; lang++)
    {
        double p = prob[lang];
        if (!(p > _ProbThreshold))
        {
            continue;
        }

        // Insert in front of the first entry with a strictly smaller probability,
        // or at the end when there is none.
        int position = ranked.FindIndex(entry => entry.Probability < p);
        if (position < 0)
        {
            position = ranked.Count;
        }

        ranked.Insert(position, new Language(_langlist[lang], p));
    }

    return ranked;
}
#endregion
}
}

View file

@ -0,0 +1,127 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using NLangDetect.Core.Utils;
using MediaBrowser.Model.Serialization;
using System.Linq;
namespace NLangDetect.Core
{
/// <summary>
/// Process-wide registry of language profiles. Profiles are loaded once into a single shared
/// instance; <see cref="Create()"/> hands out <see cref="Detector"/> objects that use the
/// instance's language list and n-gram probability map.
/// </summary>
public class DetectorFactory
{
    /// <summary>n-gram -> probability per language (indexed like <see cref="Langlist"/>).</summary>
    public Dictionary<string, ProbVector> WordLangProbMap;

    /// <summary>Names of the loaded languages, in load order.</summary>
    public List<string> Langlist;

    private static readonly DetectorFactory _instance = new DetectorFactory();

    #region Constructor(s)

    private DetectorFactory()
    {
        WordLangProbMap = new Dictionary<string, ProbVector>();
        Langlist = new List<string>();
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Loads every manifest resource of this assembly whose name contains
    /// "NLangDetect.Profiles", deserialises it as a <see cref="LangProfile"/> and registers it.
    /// </summary>
    /// <param name="json">Serializer used to read the profile resources.</param>
    /// <exception cref="NLangDetectException">A language is registered twice.</exception>
    public static void LoadProfiles(IJsonSerializer json)
    {
        var assembly = typeof(DetectorFactory).Assembly;
        var names = assembly.GetManifestResourceNames()
            .Where(i => i.IndexOf("NLangDetect.Profiles", StringComparison.Ordinal) != -1)
            .ToList();

        var index = 0;
        foreach (var name in names)
        {
            using (var stream = assembly.GetManifestResourceStream(name))
            {
                var langProfile = (LangProfile)json.DeserializeFromStream(stream, typeof(LangProfile));
                AddProfile(langProfile, index);
            }

            index++;
        }
    }

    /// <summary>Creates a detector with the default alpha.</summary>
    /// <exception cref="NLangDetectException">No profile has been loaded.</exception>
    public static Detector Create()
    {
        return CreateDetector();
    }

    /// <summary>Creates a detector and sets its alpha.</summary>
    /// <param name="alpha">Alpha value passed to <see cref="Detector.SetAlpha"/>.</param>
    /// <exception cref="NLangDetectException">No profile has been loaded.</exception>
    public static Detector Create(double alpha)
    {
        Detector detector = CreateDetector();
        detector.SetAlpha(alpha);

        return detector;
    }

    /// <summary>Sets the seed handed to detectors created afterwards (null = unseeded).</summary>
    public static void SetSeed(int? seed)
    {
        _instance.Seed = seed;
    }

    #endregion

    #region Internal methods

    /// <summary>
    /// Registers one language profile and merges its n-gram frequencies into the shared map.
    /// </summary>
    /// <param name="profile">Profile to register.</param>
    /// <param name="index">Slot of this language in every <see cref="ProbVector"/>.</param>
    /// <exception cref="NLangDetectException">
    /// A profile with the same name is already registered (<see cref="ErrorCode.DuplicateLangError"/>).
    /// </exception>
    internal static void AddProfile(LangProfile profile, int index)
    {
        var lang = profile.name;

        if (_instance.Langlist.Contains(lang))
        {
            throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
        }

        _instance.Langlist.Add(lang);

        foreach (string word in profile.freq.Keys)
        {
            ProbVector vector;
            if (!_instance.WordLangProbMap.TryGetValue(word, out vector))
            {
                vector = new ProbVector();
                _instance.WordLangProbMap.Add(word, vector);
            }

            // n_words is indexed by n-gram length - 1. An empty key, or one longer than the
            // n-gram sizes the detector extracts, has no matching counter and used to raise an
            // index error that aborted profile loading; such keys are now left without a
            // probability. NOTE(review): assumes n_words has NGram.GramsCount entries - confirm
            // against LangProfile.
            int length = word.Length;
            if (length < 1 || length > NGram.GramsCount)
            {
                continue;
            }

            double prob = (double)profile.freq[word] / profile.n_words[length - 1];
            vector[index] = prob;
        }
    }

    /// <summary>Removes all registered languages and n-gram probabilities.</summary>
    internal static void Clear()
    {
        _instance.Langlist.Clear();
        _instance.WordLangProbMap.Clear();
    }

    #endregion

    #region Private helper methods

    private static Detector CreateDetector()
    {
        if (_instance.Langlist.Count == 0)
        {
            throw new NLangDetectException("need to load profiles", ErrorCode.NeedLoadProfileError);
        }

        return new Detector(_instance);
    }

    #endregion

    #region Properties

    /// <summary>Seed handed to new detectors; null when no seed was set.</summary>
    public int? Seed { get; private set; }

    #endregion
}
}

View file

@ -0,0 +1,15 @@
namespace NLangDetect.Core
{
/// <summary>
/// Error categories carried by NLangDetectException. Several members are not raised by the
/// code visible alongside this enum; their meaning is presumed from the name only.
/// </summary>
public enum ErrorCode
{
// Presumably: no text was supplied - TODO confirm, not raised in the visible code.
NoTextError,
// Presumably: malformed input data - TODO confirm, not raised in the visible code.
FormatError,
// Presumably: a profile file could not be read - TODO confirm, not raised in the visible code.
FileLoadError,
// Raised by DetectorFactory.AddProfile when a profile name is already registered.
DuplicateLangError,
// Raised by DetectorFactory when a detector is requested before any profile is loaded.
NeedLoadProfileError,
// Raised by Detector.DetectBlock when the text contains no known n-gram.
CantDetectError,
// Presumably: training data could not be opened - TODO confirm, not raised in the visible code.
CantOpenTrainData,
// Presumably: training data is malformed - TODO confirm, not raised in the visible code.
TrainDataFormatError,
// Raised by Detector.SetPriorMap for negative or all-zero prior probabilities.
InitParamError,
}
}

View file

@ -0,0 +1,374 @@
using System;
namespace NLangDetect.Core.Extensions
{
public static class CharExtensions
{
private const int MIN_CODE_POINT = 0x000000;
private const int MAX_CODE_POINT = 0x10ffff;
private static readonly int[] _unicodeBlockStarts =
{
#region Unicode block starts
0x0000, // Basic Latin
0x0080, // Latin-1 Supplement
0x0100, // Latin Extended-A
0x0180, // Latin Extended-B
0x0250, // IPA Extensions
0x02B0, // Spacing Modifier Letters
0x0300, // Combining Diacritical Marks
0x0370, // Greek and Coptic
0x0400, // Cyrillic
0x0500, // Cyrillic Supplementary
0x0530, // Armenian
0x0590, // Hebrew
0x0600, // Arabic
0x0700, // Syriac
0x0750, // unassigned
0x0780, // Thaana
0x07C0, // unassigned
0x0900, // Devanagari
0x0980, // Bengali
0x0A00, // Gurmukhi
0x0A80, // Gujarati
0x0B00, // Oriya
0x0B80, // Tamil
0x0C00, // Telugu
0x0C80, // Kannada
0x0D00, // Malayalam
0x0D80, // Sinhala
0x0E00, // Thai
0x0E80, // Lao
0x0F00, // Tibetan
0x1000, // Myanmar
0x10A0, // Georgian
0x1100, // Hangul Jamo
0x1200, // Ethiopic
0x1380, // unassigned
0x13A0, // Cherokee
0x1400, // Unified Canadian Aboriginal Syllabics
0x1680, // Ogham
0x16A0, // Runic
0x1700, // Tagalog
0x1720, // Hanunoo
0x1740, // Buhid
0x1760, // Tagbanwa
0x1780, // Khmer
0x1800, // Mongolian
0x18B0, // unassigned
0x1900, // Limbu
0x1950, // Tai Le
0x1980, // unassigned
0x19E0, // Khmer Symbols
0x1A00, // unassigned
0x1D00, // Phonetic Extensions
0x1D80, // unassigned
0x1E00, // Latin Extended Additional
0x1F00, // Greek Extended
0x2000, // General Punctuation
0x2070, // Superscripts and Subscripts
0x20A0, // Currency Symbols
0x20D0, // Combining Diacritical Marks for Symbols
0x2100, // Letterlike Symbols
0x2150, // Number Forms
0x2190, // Arrows
0x2200, // Mathematical Operators
0x2300, // Miscellaneous Technical
0x2400, // Control Pictures
0x2440, // Optical Character Recognition
0x2460, // Enclosed Alphanumerics
0x2500, // Box Drawing
0x2580, // Block Elements
0x25A0, // Geometric Shapes
0x2600, // Miscellaneous Symbols
0x2700, // Dingbats
0x27C0, // Miscellaneous Mathematical Symbols-A
0x27F0, // Supplemental Arrows-A
0x2800, // Braille Patterns
0x2900, // Supplemental Arrows-B
0x2980, // Miscellaneous Mathematical Symbols-B
0x2A00, // Supplemental Mathematical Operators
0x2B00, // Miscellaneous Symbols and Arrows
0x2C00, // unassigned
0x2E80, // CJK Radicals Supplement
0x2F00, // Kangxi Radicals
0x2FE0, // unassigned
0x2FF0, // Ideographic Description Characters
0x3000, // CJK Symbols and Punctuation
0x3040, // Hiragana
0x30A0, // Katakana
0x3100, // Bopomofo
0x3130, // Hangul Compatibility Jamo
0x3190, // Kanbun
0x31A0, // Bopomofo Extended
0x31C0, // unassigned
0x31F0, // Katakana Phonetic Extensions
0x3200, // Enclosed CJK Letters and Months
0x3300, // CJK Compatibility
0x3400, // CJK Unified Ideographs Extension A
0x4DC0, // Yijing Hexagram Symbols
0x4E00, // CJK Unified Ideographs
0xA000, // Yi Syllables
0xA490, // Yi Radicals
0xA4D0, // unassigned
0xAC00, // Hangul Syllables
0xD7B0, // unassigned
0xD800, // High Surrogates
0xDB80, // High Private Use Surrogates
0xDC00, // Low Surrogates
0xE000, // Private Use
0xF900, // CJK Compatibility Ideographs
0xFB00, // Alphabetic Presentation Forms
0xFB50, // Arabic Presentation Forms-A
0xFE00, // Variation Selectors
0xFE10, // unassigned
0xFE20, // Combining Half Marks
0xFE30, // CJK Compatibility Forms
0xFE50, // Small Form Variants
0xFE70, // Arabic Presentation Forms-B
0xFF00, // Halfwidth and Fullwidth Forms
0xFFF0, // Specials
0x10000, // Linear B Syllabary
0x10080, // Linear B Ideograms
0x10100, // Aegean Numbers
0x10140, // unassigned
0x10300, // Old Italic
0x10330, // Gothic
0x10350, // unassigned
0x10380, // Ugaritic
0x103A0, // unassigned
0x10400, // Deseret
0x10450, // Shavian
0x10480, // Osmanya
0x104B0, // unassigned
0x10800, // Cypriot Syllabary
0x10840, // unassigned
0x1D000, // Byzantine Musical Symbols
0x1D100, // Musical Symbols
0x1D200, // unassigned
0x1D300, // Tai Xuan Jing Symbols
0x1D360, // unassigned
0x1D400, // Mathematical Alphanumeric Symbols
0x1D800, // unassigned
0x20000, // CJK Unified Ideographs Extension B
0x2A6E0, // unassigned
0x2F800, // CJK Compatibility Ideographs Supplement
0x2FA20, // unassigned
0xE0000, // Tags
0xE0080, // unassigned
0xE0100, // Variation Selectors Supplement
0xE01F0, // unassigned
0xF0000, // Supplementary Private Use Area-A
0x100000, // Supplementary Private Use Area-B
#endregion
};
private static readonly UnicodeBlock?[] _unicodeBlocks =
{
#region Unicode blocks
UnicodeBlock.BasicLatin,
UnicodeBlock.Latin1Supplement,
UnicodeBlock.LatinExtendedA,
UnicodeBlock.LatinExtendedB,
UnicodeBlock.IpaExtensions,
UnicodeBlock.SpacingModifierLetters,
UnicodeBlock.CombiningDiacriticalMarks,
UnicodeBlock.Greek,
UnicodeBlock.Cyrillic,
UnicodeBlock.CyrillicSupplementary,
UnicodeBlock.Armenian,
UnicodeBlock.Hebrew,
UnicodeBlock.Arabic,
UnicodeBlock.Syriac,
null,
UnicodeBlock.Thaana,
null,
UnicodeBlock.Devanagari,
UnicodeBlock.Bengali,
UnicodeBlock.Gurmukhi,
UnicodeBlock.Gujarati,
UnicodeBlock.Oriya,
UnicodeBlock.Tamil,
UnicodeBlock.Telugu,
UnicodeBlock.Kannada,
UnicodeBlock.Malayalam,
UnicodeBlock.Sinhala,
UnicodeBlock.Thai,
UnicodeBlock.Lao,
UnicodeBlock.Tibetan,
UnicodeBlock.Myanmar,
UnicodeBlock.Georgian,
UnicodeBlock.HangulJamo,
UnicodeBlock.Ethiopic,
null,
UnicodeBlock.Cherokee,
UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
UnicodeBlock.Ogham,
UnicodeBlock.Runic,
UnicodeBlock.Tagalog,
UnicodeBlock.Hanunoo,
UnicodeBlock.Buhid,
UnicodeBlock.Tagbanwa,
UnicodeBlock.Khmer,
UnicodeBlock.Mongolian,
null,
UnicodeBlock.Limbu,
UnicodeBlock.TaiLe,
null,
UnicodeBlock.KhmerSymbols,
null,
UnicodeBlock.PhoneticExtensions,
null,
UnicodeBlock.LatinExtendedAdditional,
UnicodeBlock.GreekExtended,
UnicodeBlock.GeneralPunctuation,
UnicodeBlock.SuperscriptsAndSubscripts,
UnicodeBlock.CurrencySymbols,
UnicodeBlock.CombiningMarksForSymbols,
UnicodeBlock.LetterlikeSymbols,
UnicodeBlock.NumberForms,
UnicodeBlock.Arrows,
UnicodeBlock.MathematicalOperators,
UnicodeBlock.MiscellaneousTechnical,
UnicodeBlock.ControlPictures,
UnicodeBlock.OpticalCharacterRecognition,
UnicodeBlock.EnclosedAlphanumerics,
UnicodeBlock.BoxDrawing,
UnicodeBlock.BlockElements,
UnicodeBlock.GeometricShapes,
UnicodeBlock.MiscellaneousSymbols,
UnicodeBlock.Dingbats,
UnicodeBlock.MiscellaneousMathematicalSymbolsA,
UnicodeBlock.SupplementalArrowsA,
UnicodeBlock.BraillePatterns,
UnicodeBlock.SupplementalArrowsB,
UnicodeBlock.MiscellaneousMathematicalSymbolsB,
UnicodeBlock.SupplementalMathematicalOperators,
UnicodeBlock.MiscellaneousSymbolsAndArrows,
null,
UnicodeBlock.CjkRadicalsSupplement,
UnicodeBlock.KangxiRadicals,
null,
UnicodeBlock.IdeographicDescriptionCharacters,
UnicodeBlock.CjkSymbolsAndPunctuation,
UnicodeBlock.Hiragana,
UnicodeBlock.Katakana,
UnicodeBlock.Bopomofo,
UnicodeBlock.HangulCompatibilityJamo,
UnicodeBlock.Kanbun,
UnicodeBlock.BopomofoExtended,
null,
UnicodeBlock.KatakanaPhoneticExtensions,
UnicodeBlock.EnclosedCjkLettersAndMonths,
UnicodeBlock.CjkCompatibility,
UnicodeBlock.CjkUnifiedIdeographsExtensionA,
UnicodeBlock.YijingHexagramSymbols,
UnicodeBlock.CjkUnifiedIdeographs,
UnicodeBlock.YiSyllables,
UnicodeBlock.YiRadicals,
null,
UnicodeBlock.HangulSyllables,
null,
UnicodeBlock.HighSurrogates,
UnicodeBlock.HighPrivateUseSurrogates,
UnicodeBlock.LowSurrogates,
UnicodeBlock.PrivateUseArea,
UnicodeBlock.CjkCompatibilityIdeographs,
UnicodeBlock.AlphabeticPresentationForms,
UnicodeBlock.ArabicPresentationFormsA,
UnicodeBlock.VariationSelectors,
null,
UnicodeBlock.CombiningHalfMarks,
UnicodeBlock.CjkCompatibilityForms,
UnicodeBlock.SmallFormVariants,
UnicodeBlock.ArabicPresentationFormsB,
UnicodeBlock.HalfwidthAndFullwidthForms,
UnicodeBlock.Specials,
UnicodeBlock.LinearBSyllabary,
UnicodeBlock.LinearBIdeograms,
UnicodeBlock.AegeanNumbers,
null,
UnicodeBlock.OldItalic,
UnicodeBlock.Gothic,
null,
UnicodeBlock.Ugaritic,
null,
UnicodeBlock.Deseret,
UnicodeBlock.Shavian,
UnicodeBlock.Osmanya,
null,
UnicodeBlock.CypriotSyllabary,
null,
UnicodeBlock.ByzantineMusicalSymbols,
UnicodeBlock.MusicalSymbols,
null,
UnicodeBlock.TaiXuanJingSymbols,
null,
UnicodeBlock.MathematicalAlphanumericSymbols,
null,
UnicodeBlock.CjkUnifiedIdeographsExtensionB,
null,
UnicodeBlock.CjkCompatibilityIdeographsSupplement,
null,
UnicodeBlock.Tags,
null,
UnicodeBlock.VariationSelectorsSupplement,
null,
UnicodeBlock.SupplementaryPrivateUseAreaA,
UnicodeBlock.SupplementaryPrivateUseAreaB,
#endregion
};
#region Public methods
/// <summary>
/// Finds the Unicode block of <paramref name="ch"/> by binary search over the block start table.
/// </summary>
/// <param name="ch">Character (UTF-16 code unit) to classify.</param>
/// <returns>The block, or null when the character lies in a range the table marks as unassigned.</returns>
/// <exception cref="ArgumentException">The value is not a valid code point.</exception>
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
    int codePoint = ch;

    if (!IsValidCodePoint(codePoint))
    {
        throw new ArgumentException("Argument is not a valid code point.", "ch");
    }

    // Invariant: the wanted entry lies in [low, high) and codePoint >= start of entry 'low'.
    int low = 0;
    int high = _unicodeBlockStarts.Length;

    while (high - low > 1)
    {
        int middle = (low + high) / 2;
        if (codePoint < _unicodeBlockStarts[middle])
        {
            high = middle;
        }
        else
        {
            low = middle;
        }
    }

    return _unicodeBlocks[(low + high) / 2];
}
#endregion
#region Private helper methods
/// <summary>
/// Tells whether <paramref name="codePoint"/> lies within MIN_CODE_POINT..MAX_CODE_POINT (inclusive).
/// </summary>
private static bool IsValidCodePoint(int codePoint)
{
    bool outOfRange = codePoint < MIN_CODE_POINT || codePoint > MAX_CODE_POINT;
    return !outOfRange;
}
#endregion
}
}

View file

@ -0,0 +1,51 @@
using System;
namespace NLangDetect.Core.Extensions
{
/// <summary>
/// Extension methods for <see cref="Random"/>.
/// </summary>
public static class RandomExtensions
{
    private const double _Epsilon = 2.22044604925031E-15;

    // The polar method produces two values per round; the second one is kept for the next call.
    // It has to be kept per Random instance: a single shared slot would hand a value produced
    // by one generator to the next caller using a different generator, which also breaks the
    // reproducibility of seeded generators. Entries vanish together with their Random instance.
    private static readonly System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianCache> _caches =
        new System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianCache>();

    /// <summary>Spare Gaussian value belonging to one Random instance.</summary>
    private sealed class GaussianCache
    {
        public double Value;
        public bool HasValue;
    }

    /// <summary>
    /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
    /// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
    /// </summary>
    /// <param name="random">Generator supplying the underlying uniform values.</param>
    /// <returns>A normally distributed value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="random"/> is null.</exception>
    /// <remarks>
    /// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
    /// </remarks>
    public static double NextGaussian(this Random random)
    {
        if (random == null)
        {
            throw new ArgumentNullException("random");
        }

        GaussianCache cache = _caches.GetOrCreateValue(random);

        lock (cache)
        {
            if (cache.HasValue)
            {
                cache.HasValue = false;

                return cache.Value;
            }

            double v1, v2, s;

            do
            {
                v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                s = v1 * v1 + v2 * v2;
            }
            while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);

            double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);

            cache.Value = v2 * multiplier;
            cache.HasValue = true;

            return v1 * multiplier;
        }
    }
}
}

View file

@ -0,0 +1,25 @@
using System;
namespace NLangDetect.Core.Extensions
{
/// <summary>
/// Extension methods for <see cref="string"/>.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
    /// </summary>
    /// <param name="s">the string to take the subsequence from</param>
    /// <param name="start">the start index, inclusive</param>
    /// <param name="end">the end index, exclusive</param>
    /// <returns>the specified subsequence</returns>
    /// <exception cref="ArgumentNullException">if <paramref name="s"/> is null</exception>
    /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
    public static string SubSequence(this string s, int start, int end)
    {
        // Extension methods can be invoked on a null receiver; report that as an argument error.
        if (s == null) throw new ArgumentNullException("s");
        if (start < 0) throw new ArgumentOutOfRangeException("start", "Argument must not be negative.");
        if (end < 0) throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
        if (end > s.Length) throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
        if (start > end) throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");

        return s.Substring(start, end - start);
    }
}
}

View file

@ -0,0 +1,131 @@
namespace NLangDetect.Core.Extensions
{
/// <summary>
/// Names of the Unicode blocks that CharExtensions.GetUnicodeBlock can return.
/// Members carry no explicit values, so their numeric values follow declaration order.
/// </summary>
public enum UnicodeBlock
{
BasicLatin,
Latin1Supplement,
LatinExtendedA,
LatinExtendedB,
IpaExtensions,
SpacingModifierLetters,
CombiningDiacriticalMarks,
Greek,
Cyrillic,
CyrillicSupplementary,
Armenian,
Hebrew,
Arabic,
Syriac,
Thaana,
Devanagari,
Bengali,
Gurmukhi,
Gujarati,
Oriya,
Tamil,
Telugu,
Kannada,
Malayalam,
Sinhala,
Thai,
Lao,
Tibetan,
Myanmar,
Georgian,
HangulJamo,
Ethiopic,
Cherokee,
UnifiedCanadianAboriginalSyllabics,
Ogham,
Runic,
Tagalog,
Hanunoo,
Buhid,
Tagbanwa,
Khmer,
Mongolian,
Limbu,
TaiLe,
KhmerSymbols,
PhoneticExtensions,
LatinExtendedAdditional,
GreekExtended,
GeneralPunctuation,
SuperscriptsAndSubscripts,
CurrencySymbols,
CombiningMarksForSymbols,
LetterlikeSymbols,
NumberForms,
Arrows,
MathematicalOperators,
MiscellaneousTechnical,
ControlPictures,
OpticalCharacterRecognition,
EnclosedAlphanumerics,
BoxDrawing,
BlockElements,
GeometricShapes,
MiscellaneousSymbols,
Dingbats,
MiscellaneousMathematicalSymbolsA,
SupplementalArrowsA,
BraillePatterns,
SupplementalArrowsB,
MiscellaneousMathematicalSymbolsB,
SupplementalMathematicalOperators,
MiscellaneousSymbolsAndArrows,
CjkRadicalsSupplement,
KangxiRadicals,
IdeographicDescriptionCharacters,
CjkSymbolsAndPunctuation,
Hiragana,
Katakana,
Bopomofo,
HangulCompatibilityJamo,
Kanbun,
BopomofoExtended,
KatakanaPhoneticExtensions,
EnclosedCjkLettersAndMonths,
CjkCompatibility,
CjkUnifiedIdeographsExtensionA,
YijingHexagramSymbols,
CjkUnifiedIdeographs,
YiSyllables,
YiRadicals,
HangulSyllables,
HighSurrogates,
HighPrivateUseSurrogates,
LowSurrogates,
PrivateUseArea,
CjkCompatibilityIdeographs,
AlphabeticPresentationForms,
ArabicPresentationFormsA,
VariationSelectors,
CombiningHalfMarks,
CjkCompatibilityForms,
SmallFormVariants,
ArabicPresentationFormsB,
HalfwidthAndFullwidthForms,
Specials,
LinearBSyllabary,
LinearBIdeograms,
AegeanNumbers,
OldItalic,
Gothic,
Ugaritic,
Deseret,
Shavian,
Osmanya,
CypriotSyllabary,
ByzantineMusicalSymbols,
MusicalSymbols,
TaiXuanJingSymbols,
MathematicalAlphanumericSymbols,
CjkUnifiedIdeographsExtensionB,
CjkCompatibilityIdeographsSupplement,
Tags,
VariationSelectorsSupplement,
SupplementaryPrivateUseAreaA,
SupplementaryPrivateUseAreaB,
}
}

View file

@ -0,0 +1,67 @@
using System;
using System.IO.Compression;
using System.Xml;
using NLangDetect.Core.Utils;
using System.IO;
namespace NLangDetect.Core
{
// TODO IMM HI: xml reader not tested
/// <summary>
/// Builds a language profile from an XML file.
/// </summary>
public static class GenProfile
{
    #region Public methods

    /// <summary>
    /// Reads the XML file (decompressing it first when its extension is ".gz" in any casing),
    /// feeds element names, text nodes and element ends to a TagExtractor created for the
    /// "abstract" tag, and writes "&lt;lang&gt;: &lt;extractor count&gt;" to the console.
    /// </summary>
    /// <param name="lang">Name given to the new profile.</param>
    /// <param name="file">Path of the XML (or gzip-compressed XML) file.</param>
    /// <returns>The profile that was passed to the extractor.</returns>
    public static LangProfile load(string lang, string file)
    {
        var profile = new LangProfile(lang);
        var extractor = new TagExtractor("abstract", 100);

        using (Stream raw = File.OpenRead(file))
        {
            bool gzipped = (Path.GetExtension(file) ?? "").ToUpper() == ".GZ";

            using (Stream input = gzipped ? new GZipStream(raw, CompressionMode.Decompress) : raw)
            using (XmlReader xmlReader = XmlReader.Create(input))
            {
                while (xmlReader.Read())
                {
                    XmlNodeType node = xmlReader.NodeType;

                    if (node == XmlNodeType.Element)
                    {
                        extractor.SetTag(xmlReader.Name);
                    }
                    else if (node == XmlNodeType.Text)
                    {
                        extractor.Add(xmlReader.Value);
                    }
                    else if (node == XmlNodeType.EndElement)
                    {
                        extractor.CloseTag(profile);
                    }
                }
            }
        }

        Console.WriteLine(lang + ": " + extractor.Count);

        return profile;
    }

    #endregion
}
}

View file

@ -0,0 +1,22 @@
using System;
namespace NLangDetect.Core
{
/// <summary>
/// Exception type of the language detection code, carrying a message and an optional
/// inner exception.
/// </summary>
[Serializable]
public class InternalException : Exception
{
    #region Constructor(s)

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    public InternalException(string message, Exception innerException)
        : base(message, innerException)
    {
    }

    /// <summary>Creates the exception with a message and no inner exception.</summary>
    public InternalException(string message)
        : base(message, null)
    {
    }

    #endregion
}
}

View file

@ -0,0 +1,45 @@
using System.Globalization;
namespace NLangDetect.Core
{
// TODO IMM HI: name??
/// <summary>
/// A language name together with the probability assigned to it.
/// </summary>
public class Language
{
    #region Constructor(s)

    /// <summary>Creates an entry for the given language name and probability.</summary>
    public Language(string name, double probability)
    {
        Probability = probability;
        Name = name;
    }

    #endregion

    #region Object overrides

    /// <summary>
    /// Formats the entry as "name:probability" with six decimal places (invariant number
    /// format); returns an empty string when the name is null.
    /// </summary>
    public override string ToString()
    {
        return Name == null
            ? ""
            : Name + ":" + Probability.ToString("0.000000", CultureInfo.InvariantCulture.NumberFormat);
    }

    #endregion

    #region Properties

    /// <summary>Language name.</summary>
    public string Name { get; set; }

    /// <summary>Probability assigned to the language.</summary>
    public double Probability { get; set; }

    #endregion
}
}

View file

@ -0,0 +1,37 @@
using System;
using MediaBrowser.Model.Serialization;
namespace NLangDetect.Core
{
// TODO IMM HI: change to non-static class
// TODO IMM HI: hide other, unnecassary classes via internal?
/// <summary>
/// Static entry point for language detection on top of <see cref="DetectorFactory"/>.
/// </summary>
public static class LanguageDetector
{
    private const double _DefaultAlpha = 0.5;

    #region Public methods

    /// <summary>Loads the language profiles through <see cref="DetectorFactory.LoadProfiles"/>.</summary>
    /// <param name="json">Serializer used to read the profiles.</param>
    public static void Initialize(IJsonSerializer json)
    {
        DetectorFactory.LoadProfiles(json);
    }

    /// <summary>Discards all loaded profiles.</summary>
    public static void Release()
    {
        DetectorFactory.Clear();
    }

    /// <summary>
    /// Detects the language of <paramref name="plainText"/> with a fresh detector that uses
    /// the default alpha.
    /// </summary>
    /// <param name="plainText">Text to analyse; must not be null or empty.</param>
    /// <returns>The result of <see cref="Detector.Detect"/> for the text.</returns>
    /// <exception cref="ArgumentException"><paramref name="plainText"/> is null or empty.</exception>
    public static string DetectLanguage(string plainText)
    {
        bool hasText = !string.IsNullOrEmpty(plainText);
        if (!hasText)
        {
            throw new ArgumentException("Argument can't be null nor empty.", "plainText");
        }

        Detector detector = DetectorFactory.Create(_DefaultAlpha);
        detector.Append(plainText);

        string detected = detector.Detect();
        return detected;
    }

    #endregion
}
}

View file

@ -0,0 +1,23 @@
using System;
namespace NLangDetect.Core
{
/// <summary>
/// Exception that carries an <see cref="ErrorCode"/> alongside its message.
/// </summary>
public class NLangDetectException : Exception
{
    /// <summary>
    /// Creates the exception with a message and an error code.
    /// </summary>
    /// <param name="message">Human-readable description of the failure.</param>
    /// <param name="errorCode">Code identifying the kind of failure.</param>
    public NLangDetectException(string message, ErrorCode errorCode)
        : base(message)
    {
        ErrorCode = errorCode;
    }

    /// <summary>
    /// Error code supplied at construction time; read-only for callers.
    /// </summary>
    public ErrorCode ErrorCode { get; private set; }
}
}

View file

@ -0,0 +1,35 @@
using System;
using System.Collections.Generic;
namespace NLangDetect.Core
{
/// <summary>
/// Sparse vector of doubles indexed by int. Only non-zero entries are stored;
/// reading an index that has no stored entry yields 0.0.
/// </summary>
public class ProbVector
{
    private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();

    /// <summary>
    /// Gets or sets the value at <paramref name="key"/>.
    /// </summary>
    /// <param name="key">Index of the entry.</param>
    /// <returns>The stored value, or 0.0 when nothing is stored for the index.</returns>
    public double this[int key]
    {
        get
        {
            double stored;
            if (_dict.TryGetValue(key, out stored))
            {
                return stored;
            }

            return 0.0;
        }

        set
        {
            bool isZero = Math.Abs(value) < double.Epsilon;

            if (isZero)
            {
                // Zero is represented by absence; Remove is a no-op for a missing key.
                _dict.Remove(key);
            }
            else
            {
                _dict[key] = value;
            }
        }
    }
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,118 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace NLangDetect.Core.Utils
{
/// <summary>
/// N-gram frequency profile for one language: a gram-to-count map plus the
/// total number of counted grams for each gram length.
/// </summary>
public class LangProfile
{
    private const int MinimumFreq = 2;
    private const int LessFreqRatio = 100000;

    // Built once per process instead of on every OmitLessFreq call; constructing
    // a RegexOptions.Compiled regex is expensive.
    private static readonly Regex SingleRomanLetter = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
    private static readonly Regex ContainsRomanLetter = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);

    /// <summary>Profile name; while null, <see cref="Add"/> and <see cref="OmitLessFreq"/> do nothing.</summary>
    public string name { get; set; }

    /// <summary>Occurrence count per gram.</summary>
    public Dictionary<string, int> freq { get; set; }

    /// <summary>Total gram count per gram length; index is (length - 1).</summary>
    public int[] n_words { get; set; }

    #region Constructor(s)

    /// <summary>Creates an empty, unnamed profile.</summary>
    public LangProfile()
    {
        freq = new Dictionary<string, int>();
        n_words = new int[NGram.GramsCount];
    }

    /// <summary>Creates an empty profile with the given name.</summary>
    /// <param name="name">Profile name.</param>
    public LangProfile(string name)
    {
        this.name = name;
        freq = new Dictionary<string, int>();
        n_words = new int[NGram.GramsCount];
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Counts one occurrence of <paramref name="gram"/>. The call is ignored
    /// when the profile has no name, the gram is null, or its length is
    /// outside 1..<c>NGram.GramsCount</c>.
    /// </summary>
    /// <param name="gram">The n-gram to count.</param>
    public void Add(string gram)
    {
        if (name == null || gram == null) return; // Illegal

        int len = gram.Length;
        if (len < 1 || len > NGram.GramsCount) return; // Illegal

        n_words[len - 1]++;

        // TryGetValue leaves 'current' at 0 for an unseen gram, so one lookup
        // covers both the insert and the increment case.
        int current;
        freq.TryGetValue(gram, out current);
        freq[gram] = current + 1;
    }

    /// <summary>
    /// Prunes the profile in two passes.
    /// First, every gram whose count is at or below the threshold
    /// (n_words[0] / LessFreqRatio, but at least MinimumFreq) is removed.
    /// Second, if the surviving single ASCII-letter grams add up to less than
    /// a third of n_words[0], every gram containing an ASCII letter is removed.
    /// <see cref="n_words"/> is decremented for each removed gram.
    /// Does nothing when the profile has no name.
    /// </summary>
    public void OmitLessFreq()
    {
        if (name == null) return; // Illegal

        int threshold = n_words[0] / LessFreqRatio;
        if (threshold < MinimumFreq) threshold = MinimumFreq;

        int roman = 0;
        List<string> keysToRemove = new List<string>();

        // Removals are collected first; the dictionary must not be modified
        // while it is being enumerated.
        foreach (KeyValuePair<string, int> entry in freq)
        {
            if (entry.Value <= threshold)
            {
                n_words[entry.Key.Length - 1] -= entry.Value;
                keysToRemove.Add(entry.Key);
            }
            else if (SingleRomanLetter.IsMatch(entry.Key))
            {
                roman += entry.Value;
            }
        }

        foreach (string keyToRemove in keysToRemove)
        {
            freq.Remove(keyToRemove);
        }

        // roman check
        if (roman < n_words[0] / 3)
        {
            keysToRemove = new List<string>();

            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (ContainsRomanLetter.IsMatch(entry.Key))
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
            }

            foreach (string keyToRemove in keysToRemove)
            {
                freq.Remove(keyToRemove);
            }
        }
    }

    #endregion
}
}

View file

@ -0,0 +1,91 @@
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Reflection;
using System.Text.RegularExpressions;
using System.Linq;
using System;
namespace NLangDetect.Core.Utils
{
/// <summary>
/// Read-only key/value message table loaded once, in the static constructor,
/// from the embedded resource whose name contains "messages.properties".
/// </summary>
public static class Messages
{
    private static readonly Dictionary<string, string> _messages;

    static Messages()
    {
        _messages = LoadMessages();
    }

    /// <summary>
    /// Looks up the message stored under <paramref name="key"/>.
    /// </summary>
    /// <param name="key">Message key.</param>
    /// <returns>
    /// The stored value, or the placeholder "!key!" when the key is unknown.
    /// </returns>
    public static string getString(string key)
    {
        string value;

        return
            _messages.TryGetValue(key, out value)
                ? value
                : string.Format("!{0}!", key);
    }

    /// <summary>
    /// Reads the embedded properties resource line by line. Empty lines are
    /// skipped; every other line must have the form "key=value" with exactly
    /// one '='. \uXXXX escapes in the value are decoded.
    /// </summary>
    /// <exception cref="InternalException">
    /// The resource cannot be located or opened, or a line is malformed.
    /// </exception>
    private static Dictionary<string, string> LoadMessages()
    {
        Assembly assembly = typeof(Messages).Assembly;

        string manifestName =
            assembly
                .GetManifestResourceNames()
                .FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);

        // FirstOrDefault yields null when nothing matches. Passing null on to
        // GetManifestResourceStream would throw ArgumentNullException, so the
        // missing resource is reported explicitly here.
        if (manifestName == null)
        {
            throw new InternalException("Couldn't find an embedded resource whose name contains 'messages.properties'.");
        }

        Stream messagesStream = assembly.GetManifestResourceStream(manifestName);

        if (messagesStream == null)
        {
            throw new InternalException(string.Format("Couldn't get embedded resource named '{0}'.", manifestName));
        }

        using (messagesStream)
        using (var sr = new StreamReader(messagesStream))
        {
            var messages = new Dictionary<string, string>();

            while (!sr.EndOfStream)
            {
                string line = sr.ReadLine();

                if (string.IsNullOrEmpty(line))
                {
                    continue;
                }

                string[] keyValue = line.Split('=');

                if (keyValue.Length != 2)
                {
                    throw new InternalException(string.Format("Invalid format of the 'Messages.properties' resource. Offending line: '{0}'.", line.Trim()));
                }

                string key = keyValue[0];
                string value = UnescapeUnicodeString(keyValue[1]);

                messages.Add(key, value);
            }

            return messages;
        }
    }

    /// <summary>
    /// Replaces each \uXXXX escape sequence in <paramref name="s"/> with the
    /// character it denotes. Returns null for null input.
    /// </summary>
    /// <remarks>
    /// Taken from: http://stackoverflow.com/questions/1615559/converting-unicode-strings-to-escaped-ascii-string/1615860#1615860
    /// </remarks>
    private static string UnescapeUnicodeString(string s)
    {
        if (s == null)
        {
            return null;
        }

        return
            Regex.Replace(
                s,
                @"\\u(?<Value>[a-zA-Z0-9]{4})",
                match => ((char)int.Parse(match.Groups["Value"].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString());
    }
}
}

View file

@ -0,0 +1,330 @@
// TODO IMM HI: check which classes can be made internal?
using System.Collections.Generic;
using System.Text;
using NLangDetect.Core.Extensions;
namespace NLangDetect.Core.Utils
{
/// <summary>
/// Sliding window over the most recently added characters, from which the
/// trailing 1-, 2- and 3-grams can be read. Also provides the character
/// normalization applied to every character before it enters the window.
/// </summary>
public class NGram
{
    /// <summary>Largest n-gram length that is tracked.</summary>
    public const int GramsCount = 3;

    // Latin-1 Supplement characters that Normalize turns into a space.
    private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");

    // Each string is one class of CJK ideographs; its first character is the
    // representative every member of the class is normalized to.
    private static readonly string[] CjkClass =
    {
        #region CJK classes

        Messages.getString("NGram.KANJI_1_0"),
        Messages.getString("NGram.KANJI_1_2"),
        Messages.getString("NGram.KANJI_1_4"),
        Messages.getString("NGram.KANJI_1_8"),
        Messages.getString("NGram.KANJI_1_11"),
        Messages.getString("NGram.KANJI_1_12"),
        Messages.getString("NGram.KANJI_1_13"),
        Messages.getString("NGram.KANJI_1_14"),
        Messages.getString("NGram.KANJI_1_16"),
        Messages.getString("NGram.KANJI_1_18"),
        Messages.getString("NGram.KANJI_1_22"),
        Messages.getString("NGram.KANJI_1_27"),
        Messages.getString("NGram.KANJI_1_29"),
        Messages.getString("NGram.KANJI_1_31"),
        Messages.getString("NGram.KANJI_1_35"),
        Messages.getString("NGram.KANJI_2_0"),
        Messages.getString("NGram.KANJI_2_1"),
        Messages.getString("NGram.KANJI_2_4"),
        Messages.getString("NGram.KANJI_2_9"),
        Messages.getString("NGram.KANJI_2_10"),
        Messages.getString("NGram.KANJI_2_11"),
        Messages.getString("NGram.KANJI_2_12"),
        Messages.getString("NGram.KANJI_2_13"),
        Messages.getString("NGram.KANJI_2_15"),
        Messages.getString("NGram.KANJI_2_16"),
        Messages.getString("NGram.KANJI_2_18"),
        Messages.getString("NGram.KANJI_2_21"),
        Messages.getString("NGram.KANJI_2_22"),
        Messages.getString("NGram.KANJI_2_23"),
        Messages.getString("NGram.KANJI_2_28"),
        Messages.getString("NGram.KANJI_2_29"),
        Messages.getString("NGram.KANJI_2_30"),
        Messages.getString("NGram.KANJI_2_31"),
        Messages.getString("NGram.KANJI_2_32"),
        Messages.getString("NGram.KANJI_2_35"),
        Messages.getString("NGram.KANJI_2_36"),
        Messages.getString("NGram.KANJI_2_37"),
        Messages.getString("NGram.KANJI_2_38"),
        Messages.getString("NGram.KANJI_3_1"),
        Messages.getString("NGram.KANJI_3_2"),
        Messages.getString("NGram.KANJI_3_3"),
        Messages.getString("NGram.KANJI_3_4"),
        Messages.getString("NGram.KANJI_3_5"),
        Messages.getString("NGram.KANJI_3_8"),
        Messages.getString("NGram.KANJI_3_9"),
        Messages.getString("NGram.KANJI_3_11"),
        Messages.getString("NGram.KANJI_3_12"),
        Messages.getString("NGram.KANJI_3_13"),
        Messages.getString("NGram.KANJI_3_15"),
        Messages.getString("NGram.KANJI_3_16"),
        Messages.getString("NGram.KANJI_3_18"),
        Messages.getString("NGram.KANJI_3_19"),
        Messages.getString("NGram.KANJI_3_22"),
        Messages.getString("NGram.KANJI_3_23"),
        Messages.getString("NGram.KANJI_3_27"),
        Messages.getString("NGram.KANJI_3_29"),
        Messages.getString("NGram.KANJI_3_30"),
        Messages.getString("NGram.KANJI_3_31"),
        Messages.getString("NGram.KANJI_3_32"),
        Messages.getString("NGram.KANJI_3_35"),
        Messages.getString("NGram.KANJI_3_36"),
        Messages.getString("NGram.KANJI_3_37"),
        Messages.getString("NGram.KANJI_3_38"),
        Messages.getString("NGram.KANJI_4_0"),
        Messages.getString("NGram.KANJI_4_9"),
        Messages.getString("NGram.KANJI_4_10"),
        Messages.getString("NGram.KANJI_4_16"),
        Messages.getString("NGram.KANJI_4_17"),
        Messages.getString("NGram.KANJI_4_18"),
        Messages.getString("NGram.KANJI_4_22"),
        Messages.getString("NGram.KANJI_4_24"),
        Messages.getString("NGram.KANJI_4_28"),
        Messages.getString("NGram.KANJI_4_34"),
        Messages.getString("NGram.KANJI_4_39"),
        Messages.getString("NGram.KANJI_5_10"),
        Messages.getString("NGram.KANJI_5_11"),
        Messages.getString("NGram.KANJI_5_12"),
        Messages.getString("NGram.KANJI_5_13"),
        Messages.getString("NGram.KANJI_5_14"),
        Messages.getString("NGram.KANJI_5_18"),
        Messages.getString("NGram.KANJI_5_26"),
        Messages.getString("NGram.KANJI_5_29"),
        Messages.getString("NGram.KANJI_5_34"),
        Messages.getString("NGram.KANJI_5_39"),
        Messages.getString("NGram.KANJI_6_0"),
        Messages.getString("NGram.KANJI_6_3"),
        Messages.getString("NGram.KANJI_6_9"),
        Messages.getString("NGram.KANJI_6_10"),
        Messages.getString("NGram.KANJI_6_11"),
        Messages.getString("NGram.KANJI_6_12"),
        Messages.getString("NGram.KANJI_6_16"),
        Messages.getString("NGram.KANJI_6_18"),
        Messages.getString("NGram.KANJI_6_20"),
        Messages.getString("NGram.KANJI_6_21"),
        Messages.getString("NGram.KANJI_6_22"),
        Messages.getString("NGram.KANJI_6_23"),
        Messages.getString("NGram.KANJI_6_25"),
        Messages.getString("NGram.KANJI_6_28"),
        Messages.getString("NGram.KANJI_6_29"),
        Messages.getString("NGram.KANJI_6_30"),
        Messages.getString("NGram.KANJI_6_32"),
        Messages.getString("NGram.KANJI_6_34"),
        Messages.getString("NGram.KANJI_6_35"),
        Messages.getString("NGram.KANJI_6_37"),
        Messages.getString("NGram.KANJI_6_39"),
        Messages.getString("NGram.KANJI_7_0"),
        Messages.getString("NGram.KANJI_7_3"),
        Messages.getString("NGram.KANJI_7_6"),
        Messages.getString("NGram.KANJI_7_7"),
        Messages.getString("NGram.KANJI_7_9"),
        Messages.getString("NGram.KANJI_7_11"),
        Messages.getString("NGram.KANJI_7_12"),
        Messages.getString("NGram.KANJI_7_13"),
        Messages.getString("NGram.KANJI_7_16"),
        Messages.getString("NGram.KANJI_7_18"),
        Messages.getString("NGram.KANJI_7_19"),
        Messages.getString("NGram.KANJI_7_20"),
        Messages.getString("NGram.KANJI_7_21"),
        Messages.getString("NGram.KANJI_7_23"),
        Messages.getString("NGram.KANJI_7_25"),
        Messages.getString("NGram.KANJI_7_28"),
        Messages.getString("NGram.KANJI_7_29"),
        Messages.getString("NGram.KANJI_7_32"),
        Messages.getString("NGram.KANJI_7_33"),
        Messages.getString("NGram.KANJI_7_35"),
        Messages.getString("NGram.KANJI_7_37"),

        #endregion
    };

    // Maps every character listed in CjkClass to its class representative.
    private static readonly Dictionary<char, char> _cjkMap;

    // Window of the last characters added; always starts with a space.
    private StringBuilder _grams;

    // True while the current word has had two or more consecutive upper-case
    // characters; Get returns null while this is set.
    private bool _capitalword;

    #region Constructor(s)

    static NGram()
    {
        _cjkMap = new Dictionary<char, char>();

        foreach (string cjkClass in CjkClass)
        {
            char representative = cjkClass[0];

            foreach (char member in cjkClass)
            {
                _cjkMap.Add(member, representative);
            }
        }
    }

    /// <summary>Creates an empty window (a single leading space).</summary>
    public NGram()
    {
        _grams = new StringBuilder(" ");
        _capitalword = false;
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Maps a character to its normalized form, depending on its Unicode block.
    /// Non-letters in Basic Latin, the excluded Latin-1 characters and all of
    /// General Punctuation become a space; several scripts are collapsed to a
    /// single representative character; characters with no known block, or
    /// with no rule, are returned unchanged.
    /// </summary>
    /// <param name="ch">Character to normalize.</param>
    /// <returns>The normalized character.</returns>
    public static char Normalize(char ch)
    {
        UnicodeBlock? block = ch.GetUnicodeBlock();

        if (!block.HasValue)
        {
            return ch;
        }

        switch (block.Value)
        {
            case UnicodeBlock.BasicLatin:
                bool isAsciiLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
                return isAsciiLetter ? ch : ' ';

            case UnicodeBlock.Latin1Supplement:
                return Latin1Excluded.IndexOf(ch) >= 0 ? ' ' : ch;

            case UnicodeBlock.GeneralPunctuation:
                return ' ';

            case UnicodeBlock.Arabic:
                return ch == '\u06cc' ? '\u064a' : ch;

            case UnicodeBlock.LatinExtendedAdditional:
                return ch >= '\u1ea0' ? '\u1ec3' : ch;

            case UnicodeBlock.Hiragana:
                return '\u3042';

            case UnicodeBlock.Katakana:
                return '\u30a2';

            case UnicodeBlock.Bopomofo:
            case UnicodeBlock.BopomofoExtended:
                return '\u3105';

            case UnicodeBlock.CjkUnifiedIdeographs:
                char representative;
                return _cjkMap.TryGetValue(ch, out representative) ? representative : ch;

            case UnicodeBlock.HangulSyllables:
                return '\uac00';

            default:
                return ch;
        }
    }

    /// <summary>
    /// Normalizes <paramref name="ch"/> and pushes it into the window. A space
    /// at the end of the window resets it before the new character is added,
    /// and a space directly after a space is dropped. The window keeps at most
    /// <see cref="GramsCount"/> characters.
    /// </summary>
    /// <param name="ch">Character to add.</param>
    public void AddChar(char ch)
    {
        char normalized = Normalize(ch);
        char previous = _grams[_grams.Length - 1];

        if (previous == ' ')
        {
            // Word boundary: start a fresh window.
            _grams = new StringBuilder(" ");
            _capitalword = false;

            if (normalized == ' ')
            {
                return;
            }
        }
        else if (_grams.Length >= GramsCount)
        {
            _grams.Remove(0, 1);
        }

        _grams.Append(normalized);

        // The flag is raised by two consecutive upper-case characters, stays
        // raised while upper-case characters continue, and is cleared by any
        // other character.
        _capitalword = char.IsUpper(normalized) && (_capitalword || char.IsUpper(previous));
    }

    /// <summary>
    /// Returns the trailing n-gram of the window.
    /// </summary>
    /// <param name="n">Gram length, 1 to <see cref="GramsCount"/>.</param>
    /// <returns>
    /// The last <paramref name="n"/> characters of the window, or null when
    /// the capital-word flag is set, <paramref name="n"/> is out of range,
    /// the window is shorter than <paramref name="n"/>, or the requested
    /// 1-gram is a space.
    /// </returns>
    public string Get(int n)
    {
        if (_capitalword)
        {
            return null;
        }

        int len = _grams.Length;

        if (n < 1 || n > GramsCount || len < n)
        {
            return null;
        }

        if (n > 1)
        {
            // TODO IMM HI: is ToString() here effective?
            return _grams.ToString().SubSequence(len - n, len);
        }

        char last = _grams[len - 1];

        return last == ' ' ? null : last.ToString();
    }

    #endregion
}
}

View file

@ -0,0 +1,76 @@
using System.Text;
namespace NLangDetect.Core.Utils
{
/// <summary>
/// Accumulates the text found inside one target tag and, when that tag is
/// closed, feeds the text's n-grams into a <see cref="LangProfile"/>.
/// </summary>
public class TagExtractor
{
    // TODO IMM HI: do they really need to be internal?

    // Name of the tag whose text is collected.
    internal string Target;

    // Collected text must be longer than this for CloseTag to process it.
    internal int Threshold;

    // Text collected for the tag that is currently open.
    internal StringBuilder StringBuilder;

    // Name of the tag that is currently open, or null.
    internal string Tag;

    /// <summary>
    /// Creates an extractor for the given tag.
    /// </summary>
    /// <param name="tag">Name of the tag whose text is collected.</param>
    /// <param name="threshold">Length the collected text must exceed to be counted.</param>
    public TagExtractor(string tag, int threshold)
    {
        Target = tag;
        Threshold = threshold;
        Count = 0;

        Clear();
    }

    /// <summary>Number of tag contents processed by <see cref="CloseTag"/> so far.</summary>
    public int Count { get; private set; }

    /// <summary>Discards the collected text and forgets the current tag.</summary>
    public void Clear()
    {
        StringBuilder = new StringBuilder();
        Tag = null;
    }

    /// <summary>Records <paramref name="tag"/> as the tag that is currently open.</summary>
    public void SetTag(string tag)
    {
        Tag = tag;
    }

    /// <summary>
    /// Appends <paramref name="line"/> to the collected text, but only while
    /// the current tag is the target tag and the line is not null.
    /// </summary>
    public void Add(string line)
    {
        if (line == null || Tag != Target)
        {
            return;
        }

        StringBuilder.Append(line);
    }

    /// <summary>
    /// Ends the current tag. If it is the target tag, a profile was supplied
    /// and the collected text is longer than the threshold, every character
    /// is pushed through an <see cref="NGram"/> and the resulting 1- to
    /// <c>NGram.GramsCount</c>-grams are added to <paramref name="profile"/>;
    /// <see cref="Count"/> is then incremented. The extractor is cleared in
    /// every case.
    /// </summary>
    /// <param name="profile">Profile receiving the grams; may be null.</param>
    public void CloseTag(LangProfile profile)
    {
        bool shouldExtract =
            profile != null
            && Tag == Target
            && StringBuilder.Length > Threshold;

        if (shouldExtract)
        {
            var window = new NGram();
            string text = StringBuilder.ToString();

            foreach (char c in text)
            {
                window.AddChar(c);

                for (int size = 1; size <= NGram.GramsCount; size++)
                {
                    profile.Add(window.Get(size));
                }
            }

            Count++;
        }

        Clear();
    }
}
}

View file

@ -0,0 +1,128 @@
NGram.CJK_KANJI_EXCLUDE=\u0020\uFF08\uFF09
NGram.LATIN1_EXCLUDE=\u00A0\u00AB\u00B0\u00BB
NGram.KANJI_1_0=\u4F7C\u6934
NGram.KANJI_1_2=\u88CF\u95B2
NGram.KANJI_1_4=\u7027\u7DCB
NGram.KANJI_1_8=\u4E80\u4E9C\u4EEE\u5263\u5264\u5270\u52C5\u52E7\u52F2\u53B3\u5449\u58CA\u58CC\u5968\u59C9\u59EB\u5D8B\u5DE3\u5E30\u6075\u622F\u623B\u6255\u629C\u629E\u62DD\u62E1\u633F\u635C\u63FA\u6442\u6589\u658E\u6669\u66A6\u66FD\u6804\u685C\u6B69\u6B6F\u6BBB\u6C37\u6C5A\u6D44\u6E09\u6E0B\u6E13\u6EDD\u713C\u72A0\u731F\u7363\u7A32\u7A42\u7A93\u7ADC\u7C8B\u7C9B\u7DD1\u7E01\u7E04\u7E26\u7E4A\u7E4B\u7E70\u8074\u8107\u8133\u81D3\u820E\u8217\u8358\u83D3\u85AC\u8987\u899A\u8B21\u8B72\u8B83\u8CDB\u9045\u90F7\u91C8\u9271\u9283\u92AD\u9665\u967A\u96A0\u96A3\u96B7\u970A\u983C\u9854\u9855\u99C6\u9A12\u9ED9\u9F62
NGram.KANJI_1_11=\u67D8\u831C
NGram.KANJI_1_12=\u5742\u57FC\u5800
NGram.KANJI_1_13=\u4E3C\u4E98\u4FE3\u4FF5\u5072\u51A8\u53A9\u5451\u546A\u5504\u5516\u55A9\u55B0\u5618\u5642\u565B\u567A\u56A2\u57F4\u5840\u5841\u58F1\u59F6\u5A2F\u5B22\u5B8D\u5DCC\u5EFB\u5F10\u60A9\u60E3\u61D0\u62F6\u63B4\u63BB\u63C3\u6681\u685F\u6955\u6962\u696F\u698A\u698E\u69FB\u6A2B\u6A7F\u6B53\u6BD8\u6D99\u6E07\u7460\u7473\u7560\u7573\u758E\u7690\u7815\u783A\u7962\u7A4F\u7A63\u7AEA\u7BED\u7CA7\u7D18\u7D3A\u7E4D\u8061\u8218\u8276\u82C5\u8597\u85AB\u86CD\u874B\u88FE\u8ACF\u8B90\u8D0B\u8FBF\u9013\u9061\u914E\u9154\u918D\u9190\u91A4\u91B8\u9262\u929A\u92ED\u92F3\u932C\u96EB\u96F0\u976D\u97EE\u981A\u99C4\u9A28\u9AC4\u9B8E\u9C10\u9D0E\u9D5C\u9D8F\u9E78\u9EB9\u9EBA\u9EBF
NGram.KANJI_1_14=\u5F66\u7984\u7985
NGram.KANJI_1_16=\u5861\u7B25\u844E\u9419\u9D07
NGram.KANJI_1_18=\u5039\u514E\u51E7\u51EA\u5301\u5302\u5859\u58F7\u59AC\u5C2D\u5CA8\u5EFC\u6357\u64B9\u67CA\u6802\u6834\u68BC\u6900\u6919\u691B\u69D9\u6AE8\u6D9C\u6E8C\u6F09\u6F45\u701E\u7026\u7114\u72DB\u7577\u75E9\u783F\u7895\u7A50\u7AC3\u7B48\u7B86\u7BAA\u7C7E\u7C82\u7C8D\u7CCE\u7D2C\u7F6B\u7FEB\u8557\u85AE\u86CE\u877F\u8997\u8ACC\u8CB0\u8CCE\u8FE9\u9197\u920E\u9266\u927E\u92F2\u9306\u9453\u9784\u982C\u9834\u99C8\u9BF5\u9C2F\u9D2C
NGram.KANJI_1_22=\u6762\u6A17\u887F
NGram.KANJI_1_27=\u4E21\u4E57\u4ECF\u4F1D\u4FA1\u4FF3\u5024\u50CD\u5150\u5186\u51E6\u52B4\u52B9\u5358\u53CE\u55B6\u56E3\u56F2\u56F3\u570F\u5727\u5869\u5897\u58F2\u5909\u5B9F\u5BDB\u5BFE\u5C02\u5DFB\u5E2F\u5E81\u5E83\u5EC3\u5F3E\u5F93\u5FB3\u5FB4\u5FDC\u60AA\u6226\u6238\u6271\u62E0\u6319\u63B2\u6483\u64AE\u67A0\u67FB\u691C\u697D\u69D8\u6A29\u6B73\u6B74\u6BCE\u6C17\u6CA2\u6D5C\u6E08\u6E80\u702C\u7523\u767A\u770C\u7D4C\u7D75\u7D76\u7D99\u7D9A\u7DCF\u8535\u8846\u89A7\u89B3\u8A33\u8AAC\u8AAD\u8C4A\u8EE2\u8EFD\u8FBA\u8FBC\u9244\u9332\u95A2\u95D8\u96D1\u99C5\u9A13\u9ED2
NGram.KANJI_1_29=\u4F0E\u4FFA\u5036\u53E1\u54B2\u5506\u583A\u5C3B\u5CAC\u5CE0\u5CEF\u6803\u68B6\u6A0B\u6A8E\u73C2\u7551\u7826\u7881\u79B0\u7B39\u8429\u8599\u8FBB\u9162\u95C7\u9688\u96BC\u9AEA\u9DF2
NGram.KANJI_1_31=\u5553\u938C
NGram.KANJI_1_35=\u51B4\u564C\u57DC\u5B2C\u6822\u685D\u690B\u6973\u6C93\u7511\u7887\u7A17\u83D6\u847A\u8494\u8526\u854E\u85C1\u86F8\u88B4\u93A7\u9B92\u9C39\u9C48\u9C52
NGram.KANJI_2_0=\u4E2B\u4EC3\u4F09\u4F57\u4F6F\u4F70\u4FD1\u4FDA\u500C\u5043\u516E\u5189\u5241\u530D\u5310\u5412\u54AB\u54AF\u5514\u5556\u55B1\u561F\u573B\u586D\u587D\u58C5\u58D1\u5914\u5A62\u5A6A\u5AE6\u5B40\u5B5B\u5B70\u5BB8\u5CD2\u5D01\u5D34\u5E11\u5EA0\u5F0B\u5F2D\u5F87\u607F\u621B\u6221\u6289\u63A3\u6452\u646D\u64D8\u652B\u6600\u6631\u6641\u66F7\u6773\u67B8\u67DD\u67DE\u6829\u68FB\u69AD\u6A47\u6C10\u6C68\u6C74\u6C85\u6CD3\u6D31\u6D93\u6D94\u6DB8\u6DBF\u6DC5\u6E6E\u6EA7\u6EB4\u6EC2\u6F2A\u6F2F\u6FB9\u6FC2\u6FDB\u6FEE\u70AF\u70FD\u7166\u726F\u729B\u739F\u73DE\u740A\u746D\u749C\u749F\u74E0\u759D\u75A3\u75CD\u75DE\u7600\u7620\u7688\u7738\u7762\u776B\u777D\u77E3\u781D\u7837\u78A3\u7946\u7B60\u7F44\u7F54\u7F5F\u7FAF\u8026\u807F\u80C4\u80DB\u80ED\u81E7\u824B\u82B7\u82E3\u8392\u846D\u84D3\u8548\u85B9\u86DE\u873F\u8753\u8782\u87AB\u87B3\u87D1\u87E0\u87FE\u8821\u88D8\u88E8\u8913\u891A\u892B\u8983\u8C3F\u8C49\u8C82\u8D6D\u8DE4\u8E1D\u8E1E\u8E7C\u8FE5\u8FE8\u9005\u9035\u9050\u9082\u9083\u9095\u90E2\u911E\u91AE\u91B4\u93D6\u9621\u968D\u96B9\u96D2\u9711\u9713\u973E\u9AB0\u9AB7\u9AE6\u9B03\u9B23\u9EDC\u9EEF
NGram.KANJI_2_1=\u4E82\u4F48\u4F54\u50F9\u5167\u528D\u52DE\u532F\u537B\u53C3\u5433\u555F\u55AE\u56B4\u570D\u5716\u58D3\u58DE\u5920\u5967\u5A1B\u5BEB\u5BEC\u5C08\u5C0D\u5C46\u5C6C\u5CFD\u5E36\u5E6B\u5EC8\u5EF3\u5F48\u5F91\u5F9E\u5FB5\u6046\u60E1\u61F7\u6232\u6236\u64C7\u64CA\u64D4\u64DA\u64F4\u651D\u6578\u65B7\u6649\u6A13\u6A23\u6A6B\u6A94\u6AA2\u6B0A\u6B50\u6B61\u6B72\u6B77\u6B78\u6C92\u6EAB\u6EFF\u6FD5\u6FDF\u71DF\u722D\u72C0\u734E\u737B\u746A\u7522\u773E\u78BC\u7A69\u7C3D\u7CB5\u7D55\u7D72\u7DA0\u7DAB\u7DE3\u7E5E\u7E6A\u7E7C\u7E8C\u8072\u807D\u8085\u812B\u8166\u8173\u81D8\u8209\u820A\u8332\u838A\u840A\u85E5\u860B\u8655\u865B\u88DD\u89BA\u89BD\u89C0\u8AAA\u8B6F\u8B7D\u8B8A\u8B93\u8C50\u8CF4\u8E64\u8F15\u8F49\u8FA6\u8FAD\u9109\u9130\u91AB\u91CB\u92B7\u9304\u9322\u95CA\u96A8\u96AA\u96B1\u96B8\u96D6\u96D9\u96DC\u9748\u975C\u986F\u9918\u99DB\u9A57\u9B25\u9EA5\u9EC3\u9EDE\u9F52
NGram.KANJI_2_4=\u514C\u51AA\u5614\u56AE\u56C2\u582F\u58FA\u5B0C\u5D11\u5DD2\u5DD6\u5E40\u5E5F\u5EEC\u6137\u6417\u6488\u64F2\u652A\u6582\u6689\u689F\u68D7\u69D3\u6A97\u6AB8\u6ABB\u6AC3\u6ADA\u6B7F\u6BB2\u6EA5\u6EC4\u6EF2\u7009\u701D\u7028\u703E\u7165\u71BE\u721B\u7463\u7464\u7469\u7515\u7526\u75FA\u7621\u779E\u79B1\u7A1F\u7AC4\u7AC7\u7B8F\u7BE9\u7D2E\u7D68\u7D8F\u7DB8\u7DBA\u7E46\u7E79\u7F4C\u7F88\u8070\u8073\u8076\u81BE\u82BB\u83A2\u858A\u8591\u861A\u8778\u87EC\u8805\u880D\u893B\u8A1B\u8A25\u8A36\u8A85\u8AA6\u8B17\u8B28\u8CB6\u8CE4\u8D16\u8D1B\u8ECB\u9112\u9214\u9249\u93AC\u9594\u9598\u95BB\u95D5\u965E\u96B4\u97DC\u9821\u9824\u9921\u9952\u9A55\u9A5B\u9B1A\u9C13\u9D09\u9DAF\u9E1A\u9E75\u9F67
NGram.KANJI_2_9=\u4E9F\u4F6C\u4FDE\u4FFE\u5029\u5140\u51A2\u5345\u539D\u53FB\u54C7\u5599\u560E\u561B\u563B\u566C\u5676\u5729\u574D\u57E4\u595A\u598D\u5A1F\u5A25\u5A77\u5AB2\u5AD6\u5BF0\u5C2C\u5CEA\u5E37\u5F08\u6059\u606A\u6096\u609A\u62A8\u6555\u6556\u66E6\u675E\u68E3\u69BB\u6BCB\u6BD3\u6C1F\u6C26\u6C81\u6DC4\u6DDE\u6E32\u6E44\u6E4D\u6F33\u6F7C\u6FA7\u701A\u701B\u715C\u741B\u7428\u7480\u74A8\u7504\u752C\u768B\u76CE\u78CA\u78FA\u79BA\u7C27\u8046\u81FB\u8331\u8393\u83C1\u8403\u8438\u843C\u8446\u85B0\u87D2\u8862\u8DC6\u9074\u9131\u9672\u96EF\u9704\u9706\u977C\u9ABC\u9E92\u9ECF
NGram.KANJI_2_10=\u51BD\u5704\u7350\u73A5
NGram.KANJI_2_11=\u4E15\u4EA2\u4F5A\u50D6\u5349\u53DF\u5484\u5958\u5B34\u5B5A\u5C91\u5E1B\u5F77\u61CB\u61FF\u620C\u620D\u622E\u6248\u6538\u660A\u664F\u678B\u67E9\u69B7\u69C3\u6CB1\u6CD7\u6D5A\u6DAA\u6DC7\u7099\u71EE\u7325\u7425\u7455\u747E\u749E\u75B5\u7678\u7693\u76C2\u77B0\u77BF\u78CB\u7957\u795A\u797A\u7A79\u7B08\u7B75\u7BB4\u7F9A\u7FB2\u7FDF\u80E5\u81BA\u8340\u837C\u8398\u8559\u85A8\u86DF\u8734\u8882\u88F4\u8936\u900D\u907D\u9642\u96C9\u9AFB\u9E9D\u9EBE
NGram.KANJI_2_12=\u5F57\u7940
NGram.KANJI_2_13=\u5191\u7791\u792C\u7D46
NGram.KANJI_2_15=\u5713\u58FD\u5D17\u5D19\u5DBC\u5F4C\u6191\u64A5\u687F\u69AE\u6AFB\u6EEC\u6F3F\u6FE4\u6FF1\u6FFE\u700B\u74CA\u76E1\u76E7\u7926\u792B\u79AE\u7AA9\u7C43\u7C4C\u7C64\u7DBD\u81A0\u856D\u8594\u8606\u8A62\u8AF7\u8CC8\u8CE3\u8D99\u8F1B\u8F3B\u9059\u9127\u9264\u947D\u95A9\u97CB\u980C\u9838\u9846\u99AE\u9A19\u9B06\u9B91\u9F4A\u9F4B
NGram.KANJI_2_16=\u4E69\u4EC4\u4EDF\u4EF3\u4F0B\u4F5E\u5000\u5028\u50E5\u513B\u5157\u51DC\u52D7\u530F\u5379\u53F5\u5471\u5477\u5555\u555C\u557B\u5594\u55B2\u55C9\u560D\u5616\u562E\u5630\u5653\u5657\u566F\u56A8\u56B6\u5820\u5880\u58CE\u58D9\u5950\u5969\u596D\u599E\u59B3\u59CD\u59D2\u5A40\u5AA7\u5ABC\u5AD7\u5AD8\u5B0B\u5B24\u5B38\u5B53\u5C5C\u5D06\u5D47\u5D94\u5D9D\u5E57\u5EC4\u5F46\u5FAC\u60BD\u60D8\u6123\u615D\u615F\u6175\u618A\u61AB\u61E3\u623E\u6308\u636B\u645F\u6519\u6595\u6698\u66B8\u67D9\u6840\u695D\u696E\u6979\u69C1\u69E8\u6AEC\u6AFA\u6B5F\u6CAC\u6CE0\u6CEF\u6D0C\u6D36\u6DD2\u6DD9\u6DE6\u6DEC\u6E5F\u6FA0\u6FEC\u7156\u71C4\u71DC\u71EC\u71FC\u720D\u7230\u7292\u7296\u72A2\u72CE\u7357\u737A\u7380\u7386\u73A8\u73EE\u743F\u74A6\u74CF\u74D4\u74DA\u755A\u75A5\u75B3\u75C2\u75E0\u75F1\u75FF\u7601\u7609\u7646\u7658\u769A\u76B0\u774F\u775C\u778B\u77BD\u77C7\u7843\u787F\u78F4\u79C8\u7A88\u7A95\u7AFD\u7B1E\u7B67\u7B9D\u7BCC\u7C0D\u7C11\u7C37\u7C40\u7C6E\u7CB3\u7CBD\u7D09\u7D31\u7D40\u7D5B\u7D70\u7D91\u7D9E\u7DB0\u7DD9\u7DF9\u7E08\u7E11\u7E1D\u7E35\u7E52\u7FB6\u7FBF\u7FEE\u8012\u801C\u8028\u8052\u8123\u8188\u81C3\u81DA\u81FE\u8210\u82BE\u83A0\u83D4\u8407\u8435\u8477\u849E\u84C6\u84CA\u85F9\u867A\u86B5\u86B6\u86C4\u8706\u8707\u870A\u8768\u87BB\u8831\u8839\u8879\u8921\u8938\u8964\u89A6\u89AC\u8A10\u8A3E\u8AC2\u8ADB\u8AF3\u8B2B\u8B41\u8B4E\u8B5F\u8B6B\u8B92\u8C55\u8C62\u8C73\u8C8A\u8C8D\u8CB2\u8CB3\u8CD2\u8CE1\u8CFB\u8D0D\u8E34\u8E7A\u8E8A\u8ED4\u8EFE\u8F0A\u8F1C\u8F1E\u8F26\u8FAE\u9088\u90C3\u90FE\u9134\u9148\u91D9\u91E9\u9238\u9239\u923D\u924D\u925A\u9296\u92AC\u92BB\u9315\u9319\u931A\u9321\u9370\u9394\u93A2\u93D8\u93E4\u943A\u9477\u9582\u958E\u95A1\u95C8\u95CC\u95D4\u9658\u966C\u970F\u973D\u9744\u975B\u9766\u97A3\u97A6\u97C1\u97C6\u980A\u9837\u9853\u9870\u98AF\u98B3\u98BA\u98E9\u98ED\u9912\u991B\u991E\u993D\u993F\u99D1\u99DF\u9A01\u9A3E\u9A43\u9A4D\u9ACF\u9AE1\u9B22\u9B58\u9C25\u9C3E\u9C54\u9C56\u9D15\u9D23\u9D89\u9DC2\u9DD3\u9E82\u9E8B\u9EA9\u9EE0\u9EF7\u9F07\u9F2F\u9F34\u9
F3E\u9F5F\u9F6C
NGram.KANJI_2_18=\u5155\u520E\u55DF\u56C0\u56C1\u5793\u5FD6\u5FF8\u6029\u60FA\u613E\u6147\u615A\u62C8\u6384\u6883\u6894\u68F9\u6AA3\u6AAE\u6AC2\u6E63\u7032\u70A4\u7146\u71FB\u7228\u72F7\u7370\u7441\u74BF\u75B8\u75E3\u7622\u76CD\u7768\u79E3\u7A60\u7B6E\u7BC1\u7C5F\u7D06\u7E2F\u7E39\u8146\u81CF\u8703\u8729\u8737\u87EF\u88D2\u8A22\u8AC4\u8AF6\u8E59\u8F33\u8F42\u9169\u91B1\u9278\u93C3\u93DD\u9460\u946A\u9785\u9AD1\u9B4D\u9B4E\u9C31\u9D12\u9ECC
NGram.KANJI_2_21=\u502A\u544E\u59AE\u59EC\u5D1B\u66A8\u6BD7\u6C76\u6E1D\u70EF\u742A\u7459\u7FE1\u82EF\u8343\u85C9\u8A79\u90DD
NGram.KANJI_2_22=\u4EDE\u4F7B\u504C\u50EE\u52E3\u52F0\u536E\u54A9\u54BB\u54BF\u54C2\u54E6\u550F\u556A\u55E8\u564E\u5664\u5671\u568F\u56DD\u572F\u57A0\u5809\u5924\u59A3\u59A4\u59E3\u5A13\u5A23\u5B51\u5B73\u5C50\u5C8C\u6035\u60C6\u6106\u6215\u62CE\u62FD\u64ED\u6549\u6554\u655D\u659B\u65CE\u65D6\u6615\u6624\u665E\u6677\u669D\u66E9\u6772\u677C\u696B\u6A84\u6AA0\u6BFD\u6C16\u6C86\u6C94\u6CD6\u6D2E\u6D39\u6F78\u6FB6\u705E\u70CA\u7168\u723B\u7256\u7284\u73B3\u740D\u742F\u7498\u74A9\u752D\u75F3\u7634\u768E\u76B4\u76E5\u77A0\u77DC\u781F\u782D\u7AA0\u7BFE\u7FF1\u80AB\u8174\u81EC\u8202\u8222\u8228\u82DC\u8306\u83FD\u8469\u84FF\u859C\u8617\u86B1\u8722\u8C89\u8D67\u8DCE\u8E49\u8E76\u8E87\u8FE2\u8FE4\u8FF8\u9016\u905B\u9174\u982B\u98E7\u9955\u9B32
NGram.KANJI_2_23=\u4F8F\u5055\u524C\u548E\u5583\u594E\u5CB7\u5ED6\u5F5D\u6021\u66B9\u66F0\u6C55\u6C7E\u6C82\u6E2D\u6EC7\u6ED5\u70B3\u71B9\u72C4\u73C0\u7426\u745C\u748B\u7696\u777F\u79A7\u79B9\u7F8C\u8153\u8339\u8386\u8725\u90B5\u9102\u962E\u9716\u97F6
NGram.KANJI_2_28=\u5733\u57D4\u838E\u8FEA
NGram.KANJI_2_29=\u50ED\u5F29\u62EE\u6A9C\u7BC6\u80F1\u8129\u8171\u822B\u8AEB
NGram.KANJI_2_30=\u4EB3\u4F15\u4FB7\u5006\u509A\u50A2\u5102\u5109\u5115\u5137\u5138\u513C\u524B\u524E\u5277\u528A\u52E6\u52FB\u5331\u5436\u5443\u54FD\u5538\u555E\u55C6\u55C7\u5679\u5690\u5695\u56C9\u56D1\u56EA\u588A\u58E2\u5AFB\u5B2A\u5B43\u5B7F\u5BE2\u5C37\u5D27\u5D84\u5D87\u5DD4\u5EC1\u5EDD\u5F12\u5FA0\u60F1\u616B\u61F5\u61F6\u61FE\u62DA\u6371\u6399\u63C0\u6451\u647B\u6493\u64BB\u64BF\u64C4\u64F1\u64F7\u650F\u652C\u665D\u6684\u6688\u66EC\u672E\u68E7\u69A6\u69ED\u69F3\u6A01\u6AAF\u6AE5\u6BA4\u6BAE\u6BAF\u6BC6\u6C08\u6C2C\u6C59\u6D87\u6EBC\u6ECC\u6EF7\u6F6F\u6F80\u6F86\u6FD8\u6FF0\u6FFA\u7006\u7018\u7030\u7051\u7192\u71C9\u71D9\u71F4\u71FE\u7274\u7377\u74A3\u750C\u7613\u7627\u7661\u7662\u7665\u766E\u7671\u7672\u76BA\u775E\u776A\u778C\u78E7\u7955\u7A08\u7AC5\u7B4D\u7C2B\u7C6C\u7CF0\u7D02\u7D1C\u7D73\u7DA2\u7DB5\u7DDE\u7E09\u7E0A\u7E37\u7E43\u7E61\u7E7D\u7E93\u7F3D\u7FF9\u81A9\u8271\u83F8\u84C0\u8514\u85BA\u86A9\u86FB\u879E\u8814\u8836\u889E\u8932\u896A\u896F\u8993\u89B2\u8A15\u8A16\u8A1D\u8A5B\u8A6C\u8A6D\u8A7C\u8AA1\u8AA3\u8AA5\u8B0A\u8B4F\u8B59\u8B96\u8C48\u8C54\u8CBD\u8CFA\u8D13\u8E89\u8E8B\u8EAA\u8EC0\u8EDB\u8EFC\u8F12\u8F1F\u8F3E\u8F45\u8FFA\u9015\u9183\u919E\u91A3\u91D7\u91F5\u9209\u9215\u923E\u9240\u9251\u9257\u927B\u9293\u92A8\u92C5\u92C7\u92F0\u9333\u935A\u9382\u938A\u9398\u93B3\u93D7\u93DF\u93E2\u93FD\u942B\u942E\u9433\u9463\u9470\u9472\u947E\u95D0\u96CB\u97C3\u97CC\u981C\u9839\u986B\u98B6\u98EA\u9909\u991A\u9935\u993E\u9951\u99A5\u99B1\u99D9\u99DD\u99F1\u9A2B\u9A62\u9A65\u9AAF\u9AD2\u9AEF\u9B0D\u9B28\u9B77\u9BFD\u9C49\u9C5F\u9C78\u9D3F\u9D72\u9DD7\u9E1B\u9EB4\u9EF4\u9F66\u9F94
NGram.KANJI_2_31=\u5DBD\u63C6\u6E3E\u7587\u8AF1\u8B5A\u9695
NGram.KANJI_2_32=\u53A5\u589F\u5CD9\u7109\u7F79\u8006\u8654\u8944\u968B\u96CD
NGram.KANJI_2_35=\u4F47\u4F91\u4FCE\u4FDF\u527D\u535E\u55DA\u56A5\u5879\u5A11\u5B7A\u5CAB\u5CF4\u5EBE\u5F7F\u5FA8\u601B\u606B\u60B8\u610D\u6134\u619A\u61FA\u6369\u6523\u65CC\u66C4\u6727\u6968\u6A05\u6A48\u6B59\u6BEC\u6D35\u6D38\u6E19\u701F\u7064\u711C\u716C\u71A8\u71E7\u7258\u743A\u746F\u75BD\u75D9\u75F2\u7669\u766C\u76DE\u7729\u77BC\u78EC\u792A\u7A37\u7A62\u7BE6\u7C2A\u7C50\u7D07\u7DD8\u7E5A\u7F8B\u7FD5\u7FF3\u8151\u81CD\u8317\u83F4\u85EA\u85FA\u8823\u895E\u89F4\u8A0C\u8A41\u8AA8\u8ACD\u8B10\u8CC1\u8D05\u8D73\u8E4A\u8E85\u8E91\u8EFB\u8F13\u9087\u914A\u91C9\u923F\u93B0\u9403\u95A8\u95AD\u9730\u9865\u9903\u9945\u9949\u99AD\u99E2\u9A6A\u9D26\u9E1E\u9EDD\u9F2C\u9F72
NGram.KANJI_2_36=\u4E9E\u4F86\u5011\u50B3\u5152\u5169\u5340\u5718\u5B78\u5BE6\u5BF6\u5C07\u5EE3\u61C9\u6230\u6703\u689D\u6A02\u6C23\u7063\u7368\u756B\u7576\u767C\u7A31\u7D93\u7E23\u7E3D\u81FA\u8207\u842C\u85DD\u865F\u8B49\u8B80\u8CFD\u908A\u9435\u95DC\u965D\u9AD4\u9EE8
NGram.KANJI_2_37=\u5480\u5580\u5C39\u67EF\u68B5\u6D85\u8521\u90B1
NGram.KANJI_2_38=\u4E1F\u4F96\u4FE0\u50F1\u5118\u522A\u5291\u52C1\u52DB\u52F3\u52F5\u52F8\u53B2\u55CE\u562F\u580A\u5862\u58AE\u58D8\u58DF\u58E9\u58EF\u5925\u593E\u599D\u5ABD\u5C62\u5EC2\u5EDA\u5EE2\u5F4E\u5F65\u6085\u6158\u61FC\u6200\u62CB\u633E\u6416\u6436\u6490\u64CB\u64E0\u64FA\u6514\u651C\u6524\u6558\u6583\u66B1\u66C6\u66C9\u66E0\u6A11\u6A1E\u6A38\u6A62\u6AB3\u6B16\u6B98\u6BBC\u6C2B\u6DDA\u6DE8\u6DEA\u6DFA\u6EEF\u6EFE\u6F32\u6F51\u6F5B\u700F\u71D2\u7210\u7246\u7260\u72A7\u72F9\u7375\u7378\u758A\u760B\u76DC\u76EA\u77DA\u77FD\u78DA\u7919\u797F\u79AA\u7A05\u7A4C\u7ACA\u7C72\u7D81\u7DDD\u7E31\u7E69\u7E6B\u7E73\u7E96\u7E9C\u81BD\u81C9\u81DF\u8259\u8277\u8396\u83A7\u8523\u8525\u860A\u863F\u8667\u87A2\u87F2\u881F\u883B\u89F8\u8B20\u8B74\u8B9A\u8C4E\u8C6C\u8C93\u8CEC\u8D0A\u8D0F\u8D95\u8E10\u8F4E\u8FAF\u8FF4\u905E\u9072\u9081\u908F\u91AC\u91C0\u91C1\u91D0\u921E\u9223\u9245\u929C\u92B3\u92C1\u9336\u934A\u93C8\u9444\u9452\u947C\u947F\u9592\u95B1\u95C6\u95D6\u95E1\u95E2\u96DE\u9742\u978F\u984F\u9871\u98B1\u98C4\u99ED\u9A37\u9A45\u9A5F\u9AEE\u9B27\u9BCA\u9C77\u9D51\u9D5D\u9E79\u9E7C\u9E7D\u9EB5\u9EBC\u9F61\u9F63\u9F90\u9F9C
NGram.KANJI_3_1=\u5283\u7562\u7DEC\u88E1\u8F2F
NGram.KANJI_3_2=\u5009\u502B\u5049\u5075\u507D\u5091\u5098\u50B5\u50B7\u50BE\u5100\u5104\u511F\u518A\u525B\u5289\u5442\u5805\u589C\u58C7\u5922\u596A\u5A66\u5B6B\u5BE7\u5BE9\u5DBA\u5E63\u5E7E\u5FB9\u6163\u616E\u6176\u61B2\u61B6\u61F8\u639B\u63DA\u63EE\u640D\u64B2\u64C1\u64EC\u6557\u6575\u6607\u66AB\u68C4\u6A39\u6C96\u6CC1\u6E1B\u6E6F\u6E9D\u6EC5\u6F01\u6F64\u6FC3\u7058\u707D\u7344\u7642\u76E4\u7832\u790E\u7B46\u7D05\u7D0B\u7D14\u7D19\u7D1B\u7D39\u7D61\u7DB1\u7DCA\u7DD2\u7DE0\u7DE9\u7DEF\u7DF4\u7E2E\u7E3E\u8105\u8108\u81E8\u8266\u84CB\u84EE\u85A9\u885D\u88DC\u8972\u8A02\u8A0E\u8A13\u8A17\u8A2A\u8A34\u8A3A\u8A3C\u8A69\u8A73\u8A95\u8AA0\u8AA4\u8AB2\u8AC7\u8ACB\u8B00\u8B1B\u8B1D\u8B5C\u8C9D\u8C9E\u8CA2\u8CA8\u8CA9\u8CAB\u8CAC\u8CB7\u8CBF\u8CC0\u8CDE\u8CE2\u8CFC\u8D08\u8DE1\u8E8D\u8ECC\u8EDF\u8EF8\u8F14\u8F1D\u8F2A\u8F44\u9055\u9069\u9077\u907C\u90F5\u91DD\u9285\u92FC\u9326\u932F\u9375\u9396\u93AE\u93E1\u9451\u9589\u95A3\u9663\u9670\u9673\u96BB\u9801\u9802\u9803\u9806\u9808\u9810\u983B\u984D\u9858\u9867\u98EF\u98F2\u98FE\u990A\u99D0\u9A0E\u9A5A\u9B5A\u9CE5\u9DB4\u9E97\u9F8D
NGram.KANJI_3_3=\u543E\u5BEE\u5F18\u6590\u725F\u83C5\u85E9\u9E93
NGram.KANJI_3_4=\u5016\u53AD\u5606\u5629\u58BE\u5F14\u6065\u6144\u646F\u647A\u67F5\u6953\u6C3E\u6F2C\u6F97\u6FB1\u7169\u71E6\u71ED\u74BD\u79BF\u7A1C\u7A4E\u7AAF\u7CDE\u7D17\u7D43\u7E55\u7FA8\u807E\u8139\u8490\u8569\u856A\u87FB\u8A23\u8AB9\u8AE6\u8AFA\u8B2C\u8CD1\u91D8\u92F8\u9318\u96DB\u99B4\u9BC9\u9C2D\u9CF6\u9D61\u9DFA
NGram.KANJI_3_5=\u4E26\u4F75\u4FC2\u500B\u5074\u5099\u512A\u5225\u5247\u5275\u5287\u52D5\u52D9\u52DD\u52E2\u5354\u54E1\u554F\u5712\u57F7\u5831\u5834\u5BAE\u5C0E\u5C64\u5CA1\u5CF6\u5E2B\u5E79\u5EAB\u5F35\u5F37\u5F8C\u5FA9\u611B\u614B\u63A1\u63DB\u6642\u66F8\u6771\u696D\u6975\u69CB\u6A19\u6A4B\u6A5F\u6BBA\u6C7A\u6E2C\u6E96\u6F22\u70BA\u7121\u71B1\u7372\u73FE\u74B0\u7570\u76E3\u78BA\u7A2E\u7A4D\u7AF6\u7BC0\u7BC4\u7BC9\u7C21\u7D00\u7D04\u7D0D\u7D1A\u7D30\u7D42\u7D44\u7D50\u7D66\u7D71\u7DAD\u7DDA\u7DE8\u7E54\u7F85\u7FA9\u7FD2\u8056\u805E\u8077\u8208\u83EF\u8449\u8853\u885B\u88FD\u8907\u898B\u898F\u8996\u89AA\u8A08\u8A18\u8A2D\u8A31\u8A55\u8A5E\u8A66\u8A71\u8A72\u8A8C\u8A8D\u8A9E\u8ABF\u8AD6\u8AF8\u8B58\u8B70\u8B77\u8CA0\u8CA1\u8CB4\u8CBB\u8CC7\u8CEA\u8ECA\u8ECD\u8F03\u8F09\u8F38\u8FB2\u9023\u9031\u9032\u904A\u904B\u904E\u9054\u9060\u9078\u907A\u9084\u9280\u9577\u9580\u958B\u9593\u9678\u967D\u968A\u968E\u969B\u96E2\u96E3\u96F2\u96FB\u97D3\u97FF\u9805\u9818\u982D\u984C\u985E\u98A8\u98DB\u9928\u99AC\u9BAE
NGram.KANJI_3_8=\u5F6B\u6C4E\u7B87\u8A70
NGram.KANJI_3_9=\u540B\u5B5C\u826E
NGram.KANJI_3_11=\u4F83\u4FF8\u51CB\u52BE\u53F1\u548B\u558B\u5CB1\u5D69\u5F3C\u620E\u621F\u64E2\u67DA\u6854\u69CC\u6A35\u6C8C\u6E1A\u6F15\u6FE0\u717D\u7252\u7AFA\u82D3\u83DF\u8431\u9041\u9149\u9798
NGram.KANJI_3_12=\u4ED5\u55E3\u572D\u57A3\u587E\u5983\u5A9B\u5C90\u5E61\u672D\u6960\u6F5F\u72D9\u72E9\u757F\u7949\u7950\u7E82\u7FCC\u82B8\u90B8\u91DC\u961C\u9B45
NGram.KANJI_3_13=\u55AB\u6249\u643E\u6841\u68B1\u725D\u7B8B\u7C95\u7E1E\u7F36\u8A03\u8A6B\u8E74\u95A4
NGram.KANJI_3_15=\u50AD\u50D1\u5132\u51F1\u55AC\u5617\u5687\u584A\u59EA\u5B30\u5BF5\u5C0B\u5C4D\u5EDF\u6182\u61A4\u64AB\u64FE\u66A2\u6897\u694A\u69CD\u6B3D\u6BC0\u6D29\u6F38\u7015\u7149\u71C8\u723A\u7336\u7345\u755D\u76C3\u78A9\u798D\u7AAE\u7DFB\u7E2B\u7F75\u7F77\u81E5\u834A\u852D\u85CD\u8755\u8A3B\u8A54\u8AE7\u8B02\u8B39\u8CAA\u8CE6\u8DA8\u8E5F\u8F5F\u905C\u912D\u919C\u92D2\u932B\u937E\u9418\u9583\u9812\u985B\u9905\u99B3\u99C1\u99D5\u9A30\u9CF3\u9D3B\u9D6C
NGram.KANJI_3_16=\u6D6C\u72FD\u77A5\u8956\u9C0D
NGram.KANJI_3_18=\u5919\u5F4A\u6063\u63AC\u649A\u6715\u6AD3\u71D0\u758B\u834F\u85F7\u88DF\u8F61\u93D1\u98F4\u9D60
NGram.KANJI_3_19=\u4F50\u7DB2\u962A
NGram.KANJI_3_22=\u5E96\u75D4\u91C6
NGram.KANJI_3_23=\u5E9A\u6C40\u821C\u839E\u8FED\u9EDB
NGram.KANJI_3_27=\u5F01\u66DC
NGram.KANJI_3_29=\u5023\u5208\u531D\u536F\u53E9\u54C9\u598A\u59BE\u5A20\u5D6F\u5DF3\u66C7\u66D6\u66F3\u6775\u6A3D\u6ADB\u6B86\u6C72\u6E25\u73EA\u7435\u760D\u7656\u7825\u78D0\u7A14\u7A6B\u7B20\u7BE0\u7CF8\u7DAC\u7DBB\u7DBE\u80E4\u80F4\u837B\u8466\u8568\u867B\u8A63\u91E7\u9320\u935B\u9591\u965B\u98E2\u990C\u9913\u9BAB
NGram.KANJI_3_30=\u60B6\u8AD2\u8CC2\u9237\u9328\u934D\u9397\u9830
NGram.KANJI_3_31=\u4FB6\u50D5\u51CD\u559A\u55AA\u5674\u5857\u585A\u5875\u58B3\u596E\u59E6\u5A41\u5D50\u5E25\u5E33\u5F59\u61C7\u61F2\u6368\u6383\u65AC\u68DF\u68F2\u6A3A\u6B04\u6DBC\u6DF5\u6E26\u6E4A\u6E67\u6F54\u6F70\u6FC1\u6FEB\u7159\u727D\u7652\u77EF\u78EF\u798E\u7A40\u7AAA\u7BE4\u7C60\u7CE7\u7CFE\u7D21\u7D33\u7D5E\u7D79\u7DB4\u7DBF\u7E1B\u7E8F\u7F70\u814E\u816B\u8178\u819A\u84BC\u85A6\u865C\u8766\u8A1F\u8A50\u8A60\u8A6E\u8A87\u8A98\u8AB0\u8ADC\u8AED\u8AEE\u8B0E\u8B19\u8CA7\u8CAF\u8CB8\u8CBC\u8CC3\u8CC4\u8CCA\u8CDC\u8CE0\u8CED\u8ED2\u8F29\u8F3F\u91E3\u920D\u9234\u925B\u9298\u9310\u934B\u958F\u95A5\u9727\u97FB\u9811\u984E\u98FC\u98FD\u99D2\u99FF\u9B31\u9BE8\u9C57\u9CE9\u9CF4\u9D28\u9DF9
NGram.KANJI_3_32=\u4E1E\u502D\u51A5\u5321\u58EC\u5A3C\u5BC5\u5CE8\u61A9\u620A\u65A1\u6714\u6853\u6893\u6C50\u6C5D\u7436\u745A\u745B\u773A\u7941\u7947\u8543\u865E\u8C5A\u914B\u99A8\u9AB8
NGram.KANJI_3_35=\u4E99\u5BA5\u5DFD\u608C\u60C7\u60DA\u6190\u61A7\u6753\u6777\u6787\u6B4E\u6F23\u6FE1\u6FEF\u7337\u7827\u786F\u7893\u7ABA\u7B94\u7BB8\u7C3E\u7D62\u7E6D\u80B1\u81BF\u81C6\u821B\u82E7\u83F0\u84D1\u86ED\u8888\u8B01\u8B04\u8F4D\u9291\u92E4\u932E\u9354\u936C\u939A\u9957\u9AED\u9BAA\u9BAD\u9BD6\u9BDB\u9C3B\u9D1B
NGram.KANJI_3_36=\u50C5\u53E2\u5EE0\u65BC\u70CF\u723E\u7D10\u7D9C\u806F\u8607\u862D\u8A0A\u8AFE\u8CD3\u9019\u9813\u9B6F
NGram.KANJI_3_37=\u4EA8\u4F3D\u5384\u5EFF\u60DF\u66DD\u6E5B\u8087\u82D1\u8FE6\u9640\u9E9F
NGram.KANJI_3_38=\u5147\u525D\u5678\u617E\u6372\u79A6\u8ABC\u92EA\u9438\u9817
NGram.KANJI_4_0=\u6D3C\u718F\u74EE\u8712
NGram.KANJI_4_9=\u4F84\u54C6\u5565\u68F1\u6D82\u83C7
NGram.KANJI_4_10=\u4FE9\u4FED\u51FF\u523D\u5300\u5364\u538C\u5450\u5455\u545C\u54D1\u54D7\u5578\u56A3\u58F6\u592F\u5CE6\u5D2D\u5E90\u6073\u607C\u60EB\u61D2\u62E2\u62E3\u631A\u6320\u6323\u6361\u63B7\u63B8\u63BA\u6405\u65A9\u65F7\u6619\u6655\u67A3\u67E0\u6805\u6808\u6866\u6868\u6869\u6A71\u6BE1\u6C79\u6CA5\u6CDE\u6DA4\u6DA7\u6DA9\u6E85\u70DB\u70E6\u70EB\u7115\u724D\u7410\u759F\u75AE\u75EA\u75F9\u762B\u763E\u76B1\u77EB\u783E\u79C3\u7A8D\u7A9C\u7B5D\u7BF1\u7EC5\u7ED2\u7EDE\u7EE3\u7EF7\u7EF8\u7EFD\u7F00\u7F0E\u7F15\u7F1A\u7F20\u7F24\u7F28\u7FA1\u7FD8\u8038\u803B\u804B\u80AE\u817B\u82C7\u8327\u835E\u8367\u83BA\u8424\u864F\u8681\u8682\u8715\u8717\u8721\u8747\u874E\u8845\u886C\u889C\u88E4\u89C5\u8BB6\u8BB9\u8BC0\u8BC5\u8BE1\u8BEB\u8BEC\u8BF5\u8C0E\u8C1A\u8D2E\u8D31\u8D43\u8D4E\u8D58\u8F67\u8F7F\u9489\u9499\u949D\u94A0\u94A5\u94AE\u94BE\u94D0\u94DB\u94F2\u9508\u950C\u951A\u9525\u952D\u952F\u9530\u953B\u9540\u9550\u9570\u9576\u95F0\u960E\u9668\u96CF\u97E7\u9885\u988A\u98A4\u9965\u9975\u997A\u997F\u9985\u998D\u998F\u9A6E\u9A6F\u9A74\u9A79\u9A7C\u9A82\u9A87\u9CA4\u9CC4\u9CCD\u9CD6\u9E20\u9E25\u9E35\u9E3D\u9E45\u9E49\u9E4A\u9E66
NGram.KANJI_4_16=\u576F\u579B\u6345\u78B4\u79EB\u79F8
NGram.KANJI_4_17=\u4E13\u4E1A\u4E1C\u4E24\u4E25\u4E2A\u4E3E\u4E49\u4E50\u4E66\u4E9A\u4EA7\u4EBF\u4ECE\u4EEC\u4EF7\u4F17\u4F20\u5170\u5173\u519B\u51B3\u51E4\u51FB\u5219\u521B\u522B\u529E\u52A1\u52A8\u52BF\u534F\u5355\u536B\u5386\u53BF\u53D1\u53D8\u542F\u5458\u54CD\u56E2\u56ED\u56F4\u56FE\u573A\u5904\u590D\u5934\u5B81\u5B9E\u5BF9\u5BFC\u5C14\u5C9B\u5E26\u5E7F\u5E94\u5F00\u5F20\u5F3A\u603B\u6218\u65E0\u65F6\u663E\u672F\u6743\u6784\u6807\u6C14\u6C49\u707E\u70ED\u73AF\u73B0\u7535\u76D1\u786E\u79CD\u79EF\u7B80\u7C7B\u7EA2\u7EA6\u7EA7\u7EAA\u7EBF\u7EC4\u7EC7\u7ED3\u7EDF\u7EE7\u7EED\u7EF4\u7F16\u7F57\u804C\u8054\u817E\u8282\u82CF\u83B7\u8425\u89C1\u89C2\u89C4\u89C6\u8BA1\u8BA4\u8BAE\u8BAF\u8BB0\u8BB8\u8BBA\u8BBE\u8BC1\u8BC4\u8BD1\u8BDD\u8BE5\u8BED\u8BF4\u8C03\u8D22\u8D23\u8D28\u8D39\u8D44\u8D5B\u8F66\u8F6C\u8F83\u8FBE\u8FC7\u8FD0\u8FD8\u8FD9\u8FDB\u8FDE\u9009\u94C1\u957F\u95E8\u95EE\u95F4\u95FB\u961F\u9633\u9645\u9646\u96BE\u9879\u9884\u9886\u9898\u98CE\u9A6C\u9F99
NGram.KANJI_4_18=\u51DB\u67B7
NGram.KANJI_4_22=\u4FA5\u545B\u5499\u5520\u5570\u56F1\u5A76\u5C96\u60AF\u60ED\u618B\u61A8\u62A0\u62A1\u62E7\u6363\u6390\u63B0\u6400\u6402\u6512\u6748\u70C1\u732C\u765E\u7663\u76CF\u7741\u781A\u7980\u79C6\u79FD\u7AA5\u7B0B\u7B8D\u7BA9\u7BAB\u7BD3\u7CAA\u7EAB\u7ECA\u7EE2\u7F2D\u7F30\u8110\u8113\u81CA\u835A\u8360\u84D6\u852B\u87E5\u8869\u8A8A\u8BA5\u8BF2\u8C05\u8C12\u8D30\u8D4A\u8D61\u8DF7\u8E6D\u8E8F\u8F95\u8F99\u8FAB\u94B3\u94C6\u94E3\u9504\u954A\u9563\u95FA\u9893\u9981\u9992\u9AA1\u9CAB\u9E2F\u9E33\u9EB8
NGram.KANJI_4_24=\u4E22\u4E8F\u4F1E\u4FA3\u5151\u517D\u51BB\u51D1\u5220\u529D\u52CB\u5367\u5389\u5395\u53E0\u53F9\u5413\u548F\u5524\u575E\u575F\u5784\u5792\u57A6\u57AB\u58F3\u5986\u5988\u5A04\u5A07\u5BA0\u5C18\u5C82\u5DE9\u5E10\u5E1C\u5F2F\u60E9\u6124\u629B\u6321\u6324\u635E\u63FD\u6401\u644A\u6491\u655B\u658B\u6635\u67AB\u67DC\u680B\u692D\u6984\u6A31\u6B7C\u6BD9\u6C22\u6CA6\u6CA7\u6CEA\u6CFB\u6CFC\u6D46\u6D47\u6D4A\u6D51\u6DA1\u6E0A\u6E83\u6EE4\u6EE5\u6F9C\u6FD2\u70C2\u7237\u727A\u730E\u7574\u75AF\u7792\u7816\u7845\u78B1\u7A77\u7A91\u7A9D\u7AD6\u7B3C\u7B5B\u7CAE\u7EA4\u7EB1\u7EBA\u7ECE\u7ED1\u7EF0\u7EF3\u7F14\u7F1D\u7F34\u7F62\u8042\u806A\u80A0\u80A4\u80BE\u80BF\u80C0\u810F\u8138\u8231\u8270\u829C\u82CD\u8350\u83B9\u841D\u8574\u8680\u8BB3\u8BBC\u8BBD\u8BC8\u8BF1\u8BFD\u8C0A\u8C0D\u8C1C\u8C24\u8C26\u8C2C\u8C2D\u8C34\u8D1E\u8D2C\u8D3C\u8D41\u8D42\u8D4C\u8D50\u8D5A\u8F69\u8F88\u8F90\u8FA9\u915D\u9171\u9493\u949E\u94A7\u94A9\u94BB\u94C3\u94C5\u94DD\u94F8\u9505\u9510\u9523\u9524\u95EF\u95F7\u95F9\u9600\u9610\u96F3\u97F5\u987D\u9882\u9888\u9896\u98D8\u9971\u9972\u9976\u997C\u9A84\u9A86\u9A8F\u9A97\u9A9A\u9AA4\u9CB8\u9CDE\u9E26\u9E43\u9E64\u9E70\u9F7F\u9F9F
NGram.KANJI_4_28=\u534E\u62A5\u7ECF\u7F51
NGram.KANJI_4_34=\u4E34\u4E3D\u4E4C\u4E54\u4E60\u4E61\u4E70\u4EB2\u4EC5\u4EEA\u4F18\u4F1F\u4F24\u4F26\u4FA7\u50A8\u513F\u5174\u517B\u518C\u519C\u51B5\u51CF\u5218\u521A\u5267\u52B3\u5356\u5382\u5385\u538B\u53A6\u5434\u5706\u5723\u5757\u575A\u575B\u575D\u5907\u591F\u593A\u5956\u5B59\u5BA1\u5BAB\u5BBD\u5BBE\u5BFB\u5C42\u5C81\u5E01\u5E08\u5E86\u5E93\u5F02\u5F39\u5F52\u5F55\u5F7B\u6000\u6001\u6076\u620F\u6237\u6267\u6269\u626C\u62A2\u62A4\u62DF\u62E5\u62E9\u6325\u635F\u6362\u6444\u6653\u6682\u6740\u6742\u6768\u6781\u6811\u6837\u6865\u68C0\u6B22\u6BC1\u6BD5\u6C47\u6C9F\u6CAA\u6CFD\u6D4B\u6DA8\u6E10\u6EE1\u6EE8\u706D\u7075\u70DF\u7231\u739B\u7597\u76D6\u76D8\u77FF\u7801\u7840\u79BB\u7A33\u7ADE\u7B14\u7B7E\u7CA4\u7D27\u7EB3\u7EBD\u7EC3\u7EC6\u7EC8\u7ECD\u7ED5\u7ED9\u7EDC\u7EDD\u7EE9\u7EFC\u7EFF\u7F13\u7F29\u8083\u80DC\u8111\u814A\u8230\u827A\u8363\u836F\u8428\u84DD\u867D\u8865\u88AD\u89C8\u8BA2\u8BA8\u8BA9\u8BAD\u8BB2\u8BBF\u8BC6\u8BCD\u8BD5\u8BEF\u8BF7\u8BF8\u8BFA\u8BFB\u8C08\u8D1D\u8D1F\u8D21\u8D25\u8D27\u8D2D\u8D2F\u8D35\u8D38\u8DC3\u8F6E\u8F6F\u8F7B\u8F7D\u8F86\u8F91\u8F93\u8F96\u8FB9\u8FBD\u8FC1\u8FDC\u8FDD\u9002\u9057\u90BB\u90D1\u91CA\u9488\u949F\u94A2\u94B1\u94F6\u9500\u9526\u9547\u9614\u9634\u9635\u9636\u9648\u9655\u9669\u9690\u97E9\u9875\u9876\u987A\u987B\u987E\u987F\u9891\u989D\u98DE\u9986\u9A7B\u9A8C\u9C81\u9C9C\u9F50
NGram.KANJI_4_39=\u4E1B\u4E1D\u4E27\u4EA9\u4ED1\u4ED3\u4F2A\u4FA6\u4FA8\u503A\u503E\u507F\u5188\u51AF\u51C0\u51C9\u51ED\u51EF\u5242\u5251\u52B2\u5362\u53A2\u5415\u5417\u5428\u55B7\u5760\u5899\u5939\u594B\u5987\u5A31\u5A74\u5BAA\u5C1D\u5C7F\u5C97\u5CAD\u5E05\u5E2E\u5E99\u5E9E\u5E9F\u5F03\u5FC6\u5FE7\u60AC\u60CA\u60EF\u626B\u6270\u629A\u62E6\u62E8\u6446\u6447\u654C\u67AA\u680F\u6863\u68A6\u6C64\u6D01\u6D53\u6D9D\u6DA6\u6E14\u6E17\u6EDA\u6EE9\u707F\u70BC\u70E7\u7275\u72B9\u72EE\u72F1\u743C\u7545\u76D0\u7855\u7978\u7B79\u7BEE\u7EA0\u7EAC\u7EAF\u7EB2\u7EB5\u7EB7\u7EB8\u7EB9\u7ED8\u7EEA\u7EF5\u7F05\u7F06\u7F18\u7F5A\u80C1\u80F6\u8109\u8206\u8273\u82F9\u8346\u8361\u83B2\u8427\u8651\u867E\u8854\u89C9\u8BC9\u8BCA\u8BD7\u8BDA\u8BDE\u8BE2\u8BE6\u8BFE\u8C01\u8C0B\u8C10\u8C13\u8C22\u8C23\u8C28\u8C31\u8D24\u8D26\u8D29\u8D2A\u8D2B\u8D34\u8D37\u8D3A\u8D3E\u8D3F\u8D4B\u8D4F\u8D54\u8D56\u8D5E\u8D60\u8D62\u8D75\u8D76\u8D8B\u8F68\u8F70\u8F74\u8F85\u8F89\u8FC8\u8FDF\u900A\u9012\u903B\u9093\u90AE\u917F\u9274\u94A6\u94DC\u94ED\u94FA\u94FE\u9501\u950B\u9519\u9521\u952E\u955C\u95EA\u95ED\u95F2\u95F8\u95FD\u9601\u9605\u9647\u96B6\u96FE\u9877\u9881\u9887\u9897\u989C\u98A0\u996D\u996E\u9970\u9A70\u9A71\u9A73\u9A76\u9A7E\u9A91\u9C7C\u9E1F\u9E21\u9E23\u9E2D\u9E3F\u9E4F\u9F84
NGram.KANJI_5_10=\u5239\u8EAF
NGram.KANJI_5_11=\u51C4\u8471
NGram.KANJI_5_12=\u6DC0\u7C98
NGram.KANJI_5_13=\u5631\u5815\u8695
NGram.KANJI_5_14=\u4E71\u4FA0\u5265\u52B1\u5374\u53A8\u53D9\u58EE\u5BDD\u5BFF\u5C3D\u5C4A\u5CE1\u5F25\u5F84\u604B\u60A6\u60E7\u60E8\u631F\u636E\u643A\u663C\u664B\u67A2\u6816\u697C\u6B8B\u6BB4\u6D45\u6E7F\u6EDE\u6F5C\u706F\u7089\u72ED\u732A\u732B\u76D7\u793C\u7977\u7A0E\u7A83\u80C6\u811A\u8131\u82A6\u830E\u848B\u865A\u866B\u86EE\u89E6\u8A89\u8DF5\u8E0A\u8E2A\u8F9E\u9065\u968F\u9759\u9EA6
NGram.KANJI_5_18=\u601C\u75D2
NGram.KANJI_5_26=\u4E07\u4E0E\u4E89\u4F1A\u4F53\u515A\u5185\u5199\u533A\u533B\u53C2\u53CC\u53F7\u58F0\u5965\u5B66\u5B9D\u5C06\u5C5E\u5F53\u62C5\u6570\u65AD\u65E7\u6761\u6765\u6A2A\u6B27\u6CA1\u6E29\u6E7E\u70B9\u72B6\u72EC\u732E\u753B\u79F0\u88C5\u9EC4
NGram.KANJI_5_29=\u693F\u82EB
NGram.KANJI_5_34=\u53F6\u6D9B\u83B1
NGram.KANJI_5_39=\u5C61\u788D
NGram.KANJI_6_0=\u4E10\u4E52\u4EC6\u4F88\u4FD0\u51F3\u533E\u53ED\u53EE\u5406\u541D\u5429\u5435\u5440\u5490\u5495\u54B1\u54C4\u54FC\u557C\u55D3\u5669\u56E4\u5777\u5992\u59E8\u5B7D\u5BDE\u5BE5\u5C79\u5C94\u5DCD\u5E18\u5E1A\u5E54\u5FF1\u604D\u6064\u60F6\u6127\u6177\u6233\u6252\u625B\u6273\u6296\u62C2\u62C7\u62F4\u638F\u6396\u63E3\u63EA\u6413\u6479\u64A9\u64C2\u659F\u667E\u6760\u6845\u6963\u6A90\u6B83\u6C13\u6C5E\u6D8E\u6D95\u6DCC\u6ED4\u6F13\u6F3E\u6FA1\u7076\u70D8\u710A\u71CE\u7239\u72E1\u73B7\u7599\u759A\u75A4\u75CA\u7629\u7682\u76C5\u76EF\u778E\u77AA\u787C\u7889\u788C\u78BE\u79E7\u7A96\u7A98\u7B77\u7C7D\u7CB1\u7D0A\u7D6E\u7F94\u7FCE\u8116\u814B\u814C\u819B\u828D\u82DF\u8301\u83E0\u85D5\u8611\u86A3\u8708\u8822\u8C4C\u8DB4\u8DEA\u8E42\u8E66\u8E72\u8EBA\u901B\u9157\u970E\u97ED
NGram.KANJI_6_3=\u62FC\u88D4\u9B4F
NGram.KANJI_6_9=\u4ED7\u4F63\u4FCF\u5018\u50BB\u50F5\u5154\u5201\u522E\u5254\u527F\u5306\u5462\u5492\u5496\u54A8\u54AA\u554A\u5561\u5564\u5566\u5885\u5938\u5AC2\u5AE9\u5CED\u5F64\u6084\u608D\u60A8\u60D5\u61C2\u61C8\u6254\u626F\u62AC\u6346\u634D\u640F\u6454\u6487\u6495\u64D2\u6746\u6789\u68B3\u68F5\u695E\u6986\u6995\u69A8\u6A44\u6AAC\u6B79\u6C28\u6C2E\u6CF5\u6DE4\u6E34\u6E3A\u6E89\u6F29\u70AB\u70AC\u7130\u715E\u7184\u71AC\u7238\u7281\u72E0\u74E3\u74F7\u7529\u7578\u761F\u7626\u76D4\u775B\u7779\u7784\u77BB\u780C\u780D\u7838\u7898\u78C5\u78F7\u7AED\u7B28\u7BE1\u7C07\u7CD5\u7CD9\u7CEF\u7F38\u800D\u8084\u809A\u8165\u816E\u832B\u8334\u840D\u8774\u886B\u888D\u88D9\u88F9\u8C41\u8D81\u8D9F\u8E22\u8E29\u8EB2\u8F9C\u9165\u918B\u9631\u964B\u964C\u9661\u9709\u9739\u9776\u9AD3\u9ED4
NGram.KANJI_6_10=\u4E53\u5582\u5600\u6342\u7B06
NGram.KANJI_6_11=\u5288\u543C\u5475\u5486\u54EE\u5598\u56BC\u5962\u5A36\u5A9A\u5B75\u5BA6\u5C38\u5C4E\u5F8A\u5F98\u627C\u62CC\u62D7\u63C9\u6930\u6954\u69D0\u6BEF\u6C90\u6CBD\u6CBE\u6F31\u6F88\u70D9\u7329\u75BC\u75F0\u7737\u77D7\u7B19\u7FB9\u803F\u80D6\u813E\u81C0\u8205\u8309\u83BD\u846B\u8517\u868C\u8759\u8815\u8859\u8B6C\u8E81\u8EAC\u90A2\u9698\u9B44
NGram.KANJI_6_12=\u722C\u7FD4
NGram.KANJI_6_16=\u5228\u5315\u542E\u54CE\u5509\u5527\u5543\u55B3\u55E1\u5636\u568E\u5FFF\u61E6\u6376\u642A\u6726\u74E4\u76F9\u7736\u7BD9\u8019\u80F0\u80F3\u812F\u818A\u8200\u8214\u8638\u869C\u86C0\u86C6\u86D4\u87C6\u88B1\u8902\u8C7A\u8E4B\u9119
NGram.KANJI_6_18=\u67D2\u6ED3\u87C0\u87CB\u8DDB\u901E\u9163
NGram.KANJI_6_20=\u4F5B\u52D2\u54C8\u62FF\u66FC\u6D59\u704C\u7586\u9ECE
NGram.KANJI_6_21=\u4E48\u4EFF\u4F19\u4FF1\u5021\u5077\u5195\u5212\u5269\u5401\u541E\u5427\u54EA\u5587\u558A\u55BB\u566A\u573E\u574E\u5783\u57AE\u584C\u58E4\u5960\u5976\u59CA\u5A1C\u5DE2\u5F99\u600E\u6015\u6263\u626D\u6293\u62C6\u62D6\u62EF\u62F1\u6316\u632A\u6380\u6389\u63D2\u641E\u64C5\u64CE\u65F1\u6664\u6735\u6770\u67EC\u6846\u684C\u68AD\u6B47\u6B49\u6B67\u6C1B\u6C27\u6C2F\u6C5B\u6C89\u6DF9\u6EAF\u70AE\u70E4\u731C\u7334\u73BB\u7470\u76FC\u788E\u789F\u78B0\u78B3\u7A0D\u7A3B\u7A57\u7CB9\u7F69\u8335\u8354\u84BF\u8DCC\u8DD1\u904F\u90A8\u9189\u9677\u9738\u978B
NGram.KANJI_6_22=\u5162\u53E8\u542D\u5501\u552C\u5639\u563F\u56B7\u6043\u60B4\u6194\u61CA\u634E\u63CD\u6414\u64AC\u6DAE\u6E43\u6F66\u7095\u7316\u733E\u7728\u7830\u78D5\u7ABF\u7FE9\u8018\u80EF\u8198\u8693\u86AA\u86AF\u874C\u8783\u879F\u8892\u8E6C
NGram.KANJI_6_23=\u4FD8\u4FEF\u501A\u5085\u5180\u526A\u5323\u54ED\u5634\u56CA\u58A9\u58F9\u5955\u5978\u59DA\u5A49\u5B55\u5BC7\u5BE8\u5D4C\u5E62\u6467\u64BC\u6500\u655E\u6572\u658C\u6670\u68CD\u68D5\u68E0\u6912\u6A0A\u6BB7\u6C9B\u6D3D\u6DC6\u6E23\u6F8E\u7011\u7092\u714C\u73AB\u7405\u7624\u76D2\u7960\u79C9\u7A20\u7BF7\u7F50\u804A\u8086\u81C2\u8292\u82DE\u852C\u857E\u859B\u8760\u8C6B\u8DBE\u8E48\u8F9F\u96A7
NGram.KANJI_6_25=\u4E8E\u5DF2\u5FB7\u7AD9
NGram.KANJI_6_28=\u4E58\u4ECD\u4EFD\u4F30\u4F60\u4F69\u503C\u5047\u51B0\u51F0\u5361\u5377\u53E6\u54E5\u552E\u5708\u5740\u5761\u57C3\u5821\u589E\u5979\u59C6\u5B69\u5B83\u5E15\u5E76\u5F17\u5F88\u6208\u622A\u624E\u627E\u62D4\u62DC\u63ED\u641C\u6536\u6548\u65C1\u665A\u6668\u67E5\u6B65\u6BCF\u6C61\u6CDB\u6D4E\u6D89\u6DB5\u6E38\u6EAA\u6FB3\u70B8\u745F\u7538\u7A97\u7F3A\u7F55\u805A\u8258\u827E\u82AC\u8303\u83F2\u8482\u85CF\u8DDF\u903E\u9080\u970D\u9760\u9ED1\u9ED8
NGram.KANJI_6_29=\u634F\u6518\u7B50\u809B
NGram.KANJI_6_30=\u54A7\u57C2\u5AB3\u60CB\u6886\u8378\u85D0\u8671
NGram.KANJI_6_32=\u5080\u5121\u51A4\u54AC\u55DC\u592D\u5DEB\u6292\u68D8\u69B4\u6A59\u6E24\u7FC5\u80DA\u8180\u86DB\u8700\u8DCB\u9761
NGram.KANJI_6_34=\u4E30\u51E0\u542C\u613F
NGram.KANJI_6_35=\u4E56\u547B\u55FD\u5C41\u606C\u6115\u6CAE\u7119\u795F\u7CDC\u86C9\u86F9\u8713\u873B\u8757\u8925\u892A\u96F9
NGram.KANJI_6_37=\u51B2\u5308\u5398\u54B8\u59DC\u5C4F\u5D14\u5F6D\u60E0\u6241\u6350\u699C\u6BEB\u6C6A\u6CC4\u6DEE\u6F58\u6F6D\u7199\u77EE\u7ADF\u8058\u820D\u8212\u8389\u8587\u884D\u8881\u8FA8\u8FF9\u96D5
NGram.KANJI_6_39=\u574F\u6251\u6302
NGram.KANJI_7_0=\u52FA\u5544\u60F0\u6994\u86A4\u86E4
NGram.KANJI_7_3=\u4E59\u4E7E\u4EAD\u4EF0\u4EF2\u4F0F\u4F10\u4FAF\u4FCA\u500D\u501F\u5076\u508D\u50E7\u5112\u5146\u5192\u51AC\u51DD\u51FD\u5200\u5237\u524A\u52A3\u52C3\u52C7\u52DF\u5351\u5352\u5353\u5378\u537F\u53E5\u5439\u54FA\u574A\u5782\u57CB\u5893\u58C1\u5915\u5937\u5949\u5951\u5974\u59B9\u5A18\u5A5A\u5ACC\u5B54\u5B5D\u5B64\u5B8F\u5BBF\u5BD2\u5C3A\u5C6F\u5CB3\u5D07\u5DE7\u5E84\u5E8A\u5F26\u5F69\u5F70\u5F90\u5FAA\u5FCD\u6012\u6016\u602A\u60A0\u60B2\u60BC\u6148\u6162\u6170\u6291\u6298\u62AB\u62BC\u62BD\u62D2\u62D3\u62D8\u62F3\u6311\u638C\u6398\u63E1\u642C\u6458\u64A4\u654F\u656C\u659C\u65E2\u65E8\u65EC\u6606\u6614\u6676\u6691\u6696\u66F9\u6749\u676F\u679A\u679D\u67CF\u67D4\u67F1\u67F3\u67F4\u6817\u6842\u6843\u6851\u68A8\u68CB\u68D2\u6B20\u6B32\u6BBF\u6C57\u6C88\u6CCA\u6D17\u6D1E\u6D69\u6D6E\u6D78\u6DE1\u6DFB\u6E58\u6EB6\u6F0F\u6F20\u7070\u708E\u70AD\u7126\u718A\u71C3\u7267\u72C2\u731B\u7384\u73A9\u73CD\u7434\u75AB\u75DB\u76C6\u76FE\u773C\u7891\u78C1\u795D\u7965\u79D2\u79DF\u79E6\u7A00\u7B11\u7B51\u7B54\u7C89\u7C92\u7CD6\u7D2B\u7F8A\u7FBD\u7FFC\u8010\u80A5\u80CE\u8150\u8179\u819C\u8247\u829D\u82B3\u82D7\u82E6\u8302\u8336\u8352\u83CA\u83CC\u83DC\u845B\u846C\u84B2\u84B8\u84C4\u8584\u864E\u86C7\u8861\u8863\u8870\u888B\u8896\u88D5\u8986\u8C46\u8DA3\u8E0F\u8F9B\u8FC5\u8FEB\u8FF7\u9003\u9006\u902E\u9042\u9063\u90ED\u963B\u9676\u96EA\u9756\u9B3C\u9B42\u9F3B
NGram.KANJI_7_6=\u4E01\u4E03\u4E45\u4E5D\u4E88\u4E92\u4EA1\u4ECB\u4EE4\u4F01\u4F0A\u4F2F\u4F3C\u4F4E\u4F4F\u4F55\u4F8B\u4F9D\u4FBF\u4FEE\u505C\u50CF\u516B\u516D\u5175\u5177\u5178\u5207\u520A\u5224\u526F\u529F\u52A9\u5343\u5348\u535A\u5370\u53BB\u53CB\u53F3\u5409\u542B\u544A\u547C\u5584\u5747\u5802\u590F\u592B\u5931\u5947\u597D\u5A01\u5A92\u5B63\u5B8C\u5B97\u5BA2\u5BA3\u5BA4\u5BB3\u5BB9\u5BC6\u5BCC\u5BDF\u5C04\u5C1A\u5C45\u5C4B\u5CB8\u5DE6\u5E0C\u5E1D\u5E2D\u5E55\u5E8F\u5E95\u5E97\u5EA7\u5EB7\u5EF6\u5F8B\u5FAE\u5FC5\u5FD7\u5FF5\u601D\u6025\u606F\u60F3\u611F\u623F\u6253\u6279\u627F\u6295\u6297\u62EC\u6388\u6392\u63F4\u6545\u6551\u6574\u6599\u65C5\u65E9\u6613\u6620\u6625\u666E\u666F\u66B4\u66F4\u670D\u671B\u6728\u672B\u6751\u677E\u67B6\u6838\u6839\u6848\u68EE\u690D\u6982\u6A21\u6B4C\u6B62\u6B66\u6BB5\u6BCD\u6C0F\u6C38\u6C42\u6CBF\u6CE2\u6CE8\u6D0B\u6D3E\u6D88\u6DF1\u6E05\u6E56\u706B\u7167\u7206\u7236\u7247\u7387\u7530\u7537\u7559\u7565\u7591\u75C5\u767B\u767D\u767E\u7687\u76DB\u76DF\u771F\u7763\u77ED\u7834\u79FB\u7A81\u7AE0\u7AEF\u7B56\u7B97\u7C4D\u7CBE\u7D20\u7D22\u7F72\u7FA4\u8001\u8003\u81F4\u822A\u826F\u82B1\u8349\u843D\u878D\u8857\u89D2\u8B66\u8C37\u8D70\u8D85\u8D8A\u8DB3\u8FF0\u8FFD\u9001\u901F\u90A3\u90A6\u914D\u91CE\u9632\u963F\u9644\u964D\u9664\u96C4\u96E8\u9752\u9769\u98DF
NGram.KANJI_7_7=\u4E09\u4E0A\u4E0B\u4E0D\u4E16\u4E3B\u4E8B\u4E8C\u4EE3\u4EE5\u4F4D\u4F5C\u4F7F\u5165\u5168\u516C\u5171\u51FA\u5206\u5229\u5236\u524D\u529B\u52A0\u5316\u5317\u5357\u539F\u53CA\u53F0\u5408\u540C\u540D\u548C\u5730\u57FA\u5916\u591A\u5929\u5B50\u5B9A\u5BB6\u5C0F\u5C71\u5DDE\u5DE5\u5E02\u5E73\u5EA6\u5EFA\u5F0F\u6027\u6210\u6240\u6307\u653F\u6587\u65B0\u65B9\u660E\u6700\u6709\u671F\u672C\u6B21\u6B63\u6C11\u6CBB\u6CD5\u6D77\u7269\u7279\u7406\u751F\u7528\u7531\u754C\u76EE\u76F8\u793E\u79D1\u7ACB\u7B2C\u7B49\u7CFB\u8005\u80FD\u81EA\u82F1\u884C\u8868\u897F\u8981\u901A\u9053\u90E8\u90FD\u91CD\u9AD8
NGram.KANJI_7_9=\u4E4D\u4F36\u5319\u6A61\u6DCB\u7194
NGram.KANJI_7_11=\u4E5E\u4F43\u5026\u50FB\u515C\u5243\u5420\u5446\u54B3\u54BD\u553E\u55A7\u5703\u5984\u5AC9\u5B09\u5C51\u5DFE\u5ED3\u5F1B\u6055\u618E\u62D9\u65A7\u6652\u6977\u6EBA\u707C\u75D8\u79E4\u7AFF\u7B4F\u7CA5\u808B\u8098\u80B4\u8235\u82DB\u849C\u8549\u868A\u86FE\u8718\u914C
NGram.KANJI_7_12=\u4E08\u4E38\u4F8D\u50DA\u5203\u5256\u52C9\u52D8\u52FE\u5320\u533F\u5375\u53D4\u540F\u54E8\u56DA\u5806\u5996\u5999\u59A5\u59A8\u59FF\u5AE1\u5BB0\u5BF8\u5C09\u5C3F\u5C48\u5C65\u5D29\u5E06\u5E4C\u5EB5\u5EB6\u5EB8\u5F13\u5FCC\u5FD8\u6052\u606D\u609F\u60D1\u614E\u6247\u62B1\u6349\u64E6\u6577\u65ED\u6674\u6734\u67C4\u6850\u690E\u6A58\u6B3A\u6B89\u6C41\u6CBC\u6CCC\u6CF3\u6D74\u6DAF\u6DF3\u6ECB\u6F02\u6F84\u71E5\u7261\u7272\u72AC\u72FC\u733F\u7409\u755C\u76F2\u7720\u77AC\u77E2\u7802\u786B\u78E8\u7901\u7948\u79E9\u7A1A\u7A74\u7AE3\u7B4B\u7B52\u7BB1\u7C3F\u8015\u8096\u809D\u80A2\u80A9\u80AA\u80BA\u80F8\u8102\u810A\u8154\u8155\u8170\u817A\u81A8\u81ED\u820C\u8236\u82BD\u8305\u83E9\u83F1\u840C\u85FB\u8650\u8702\u8A93\u8E44\u8FB0\u9038\u9091\u90AA\u916C\u9175\u9177\u9685\u96C0\u96C7\u96CC\u97AD
NGram.KANJI_7_13=\u63D6\u803D
NGram.KANJI_7_16=\u602F\u7566
NGram.KANJI_7_18=\u634C\u7C38
NGram.KANJI_7_19=\u4E18\u4E73\u4E95\u4EAB\u4EC1\u4ED8\u4ED9\u4F11\u4F34\u4F38\u4F59\u4FB5\u4FC3\u4FD7\u5012\u5019\u5065\u50AC\u5144\u5145\u514D\u517C\u51A0\u51B7\u5211\u5238\u523A\u523B\u5272\u52E4\u5360\u5371\u539A\u541B\u5426\u5438\u5473\u54F2\u5510\u552F\u5531\u559C\u5609\u56F0\u56FA\u591C\u5948\u594F\u59BB\u59D3\u5B85\u5B87\u5B88\u5B99\u5B9C\u5BC4\u5BFA\u5C0A\u5C3E\u5CA9\u5D0E\u5DE1\u5DE8\u5DEE\u5DF1\u5E45\u5E78\u5E7B\u5E7C\u5EAD\u5EF7\u5F1F\u5F31\u5F79\u5F7C\u5F85\u5F92\u5FA1\u5FE0\u6050\u60A3\u6212\u62DB\u632F\u6355\u63A2\u63AA\u63CF\u642D\u6469\u64CD\u653B\u6563\u660C\u662D\u667A\u6697\u66FF\u6750\u675F\u677F\u6790\u67D3\u682A\u6885\u68B0\u6B8A\u6B96\u6BDB\u6C60\u6CB9\u6CC9\u6D25\u6D66\u6DB2\u6DF7\u6E21\u6ED1\u6F2B\u6F6E\u6FC0\u7235\u725B\u72AF\u7389\u7532\u7533\u756A\u75BE\u75C7\u76AE\u76CA\u7740\u786C\u7956\u7968\u796D\u7981\u79C0\u79C1\u79CB\u79D8\u7A3F\u7AE5\u7AF9\u7E41\u7F6A\u7FFB\u8089\u80CC\u80DE\u81E3\u821E\u8239\u82E5\u8328\u8377\u85E4\u8840\u88C1\u88C2\u8C6A\u8D64\u8DDD\u8FCE\u8FD4\u9000\u9014\u907F\u90CA\u90CE\u90E1\u9152\u9178\u9686\u9694\u969C\u9707\u9732\u9AA8\u9B54\u9E7F\u9EBB
NGram.KANJI_7_20=\u4E39\u4E43\u4EAE\u4F73\u504F\u505A\u51C6\u51CC\u52AA\u5339\u5347\u53EB\u53EC\u5448\u5766\u57F9\u5854\u585E\u58A8\u5B8B\u5C01\u5CF0\u5E72\u5EC9\u5F80\u5F81\u5FBD\u5FEB\u6069\u6211\u624D\u628A\u62B5\u62CD\u6309\u63A7\u64AD\u6566\u6597\u65CB\u65D7\u6628\u6717\u6731\u674E\u675C\u683D\u6881\u6B3E\u6BD2\u6C7D\u6C99\u6CE5\u6CF0\u6D1B\u6D2A\u70C8\u719F\u724C\u7259\u73E0\u73ED\u745E\u74E6\u7518\u751A\u7686\u770B\u7B26\u8033\u80A1\u80E1\u821F\u83AB\u8499\u8D74\u8DE8\u900F\u9010\u9047\u904D\u906D\u9675\u96C5\u96F6\u96F7\u9700\u9F13
NGram.KANJI_7_21=\u5764\u59D0\u5A03\u6062\u6108\u68C9\u7164\u79BE\u7BAD\u903C
NGram.KANJI_7_23=\u4EA5\u50B2\u532A\u5366\u543B\u54E9\u5632\u59D1\u5BB5\u5DF7\u5F6A\u5F6C\u5FFD\u6070\u6168\u61BE\u63A0\u63A9\u6478\u65A4\u68A7\u6A1F\u6CAB\u70F9\u711A\u723D\u7262\u72F8\u751C\u754F\u75B9\u76C8\u7709\u7897\u7CCA\u7F9E\u8299\u82AD\u82B9\u82D4\u8304\u84C9\u84EC\u854A\u85AF\u86D9\u8FA3\u9187\u97A0
NGram.KANJI_7_25=\u4E14\u4E5F\u4F46\u514B\u5176\u5230\u5373\u53EA\u540E\u5982\u5C3C\u5DF4\u6216\u62C9\u65AF\u66FE\u6B64\u6D32\u6D6A\u7BC7\u800C
NGram.KANJI_7_28=\u4E4E\u4E9B\u4EA6\u4EC0\u4FC4\u5403\u5957\u5C24\u6089\u6258\u67D0\u758F\u7FF0\u8D6B
NGram.KANJI_7_29=\u4FAE\u5944\u5A29\u6101\u62ED\u6328\u637B\u6666\u6687\u66AE\u673D\u6756\u67FF\u6813\u68A2\u699B\u7078\u708A\u7396\u7422\u7525\u75E2\u76BF\u7766\u77B3\u7A3C\u7A92\u819D\u81FC\u8237\u8338\u8511\u88F3\u8FC2
NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5993\u5A7F\u5AC1\u5B9B\u5BC2\u5BE1\u5F04\u5F0A\u5F27\u6020\u6028\u6068\u6094\u6109\u611A\u614C\u621A\u62B9\u62D0\u62F7\u62FE\u632B\u633D\u6367\u660F\u6627\u6643\u66D9\u674F\u6795\u67AF\u67D1\u6876\u68DA\u68FA\u6905\u69FD\u6A80\u6B6A\u6CB8\u6CE3\u6DD1\u6DEB\u6E9C\u6EA2\u6EF4\u6F06\u714E\u716E\u722A\u7280\u74A7\u752B\u75B2\u75D5\u75F4\u77AD\u77E9\u785D\u79BD\u7A3D\u7A9F\u7B1B\u7B95\u7C9F\u7CDF\u80C3\u8106\u817F\u818F\u81B3\u828B\u82A5\u82AF\u840E\u851A\u853D\u8776\u87F9\u8877\u8910\u8912\u8C79\u8D66\u8FB1\u9017\u90C1\u916A\u9699\u96C1\u971C\u9774\u978D
NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20

View file

@ -8,6 +8,8 @@ using System.Threading.Tasks;
using MediaBrowser.Model.MediaInfo;
using MediaBrowser.Model.Logging;
using UniversalDetector;
using NLangDetect.Core;
using MediaBrowser.Model.Serialization;
namespace Emby.Common.Implementations.TextEncoding
{
@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
{
private readonly IFileSystem _fileSystem;
private readonly ILogger _logger;
private IJsonSerializer _json;
public TextEncoding(IFileSystem fileSystem, ILogger logger)
/// <summary>
/// Initializes a new instance of the TextEncoding class.
/// </summary>
/// <param name="fileSystem">The file system abstraction kept for later use by this instance.</param>
/// <param name="logger">The logger this instance writes diagnostic messages to.</param>
/// <param name="json">The JSON serializer kept for initializing the language detector.</param>
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
    // Plain field capture; the three assignments are independent of one another.
    _json = json;
    _logger = logger;
    _fileSystem = fileSystem;
}
public Encoding GetASCIIEncoding()
@ -63,6 +67,7 @@ namespace Emby.Common.Implementations.TextEncoding
}
}
private bool _langDetectInitialized;
public string GetDetectedEncodingName(byte[] bytes, string language)
{
var encoding = GetInitialEncoding(bytes);
@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
return "utf-8";
}
if (!_langDetectInitialized)
{
_langDetectInitialized = true;
LanguageDetector.Initialize(_json);
}
if (string.IsNullOrWhiteSpace(language))
{
language = DetectLanguage(bytes);
if (!string.IsNullOrWhiteSpace(language))
{
_logger.Debug("Text language detected as {0}", language);
}
}
var charset = DetectCharset(bytes, language);
if (!string.IsNullOrWhiteSpace(charset))
@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
return null;
}
/// <summary>
/// Attempts to detect the language of the supplied text content.
/// </summary>
/// <remarks>
/// The bytes are decoded as UTF-8, then ASCII, then UTF-16 (Encoding.Unicode), in that order.
/// The result of the first attempt that does not throw NLangDetectException is returned as-is.
/// </remarks>
/// <param name="bytes">The raw bytes of the text to analyse.</param>
/// <returns>The value returned by LanguageDetector.DetectLanguage, or null when every attempt throws.</returns>
private string DetectLanguage(byte[] bytes)
{
    // Candidate decodings, tried in the same order as the original three try/catch blocks.
    var candidates = new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

    foreach (var candidate in candidates)
    {
        try
        {
            return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
        }
        catch (NLangDetectException ex)
        {
            // Best effort: a failed attempt must not abort detection, but it should not vanish
            // silently either, so record it and fall through to the next decoding.
            _logger.Debug("Language detection failed using {0}: {1}", candidate.WebName, ex.Message);
        }
    }

    return null;
}
public Encoding GetEncodingFromCharset(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
@ -136,22 +186,29 @@ namespace Emby.Common.Implementations.TextEncoding
case "cze":
case "ces":
case "slo":
case "slk":
case "slv":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rup":
return "windows-1250";
// albanian
case "alb":
case "sqi":
return "windows-1250";
// slovak
case "slk":
case "slv":
return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
return "windows-1253";
// greek
case "gre":
case "ell":
return "windows-1253";
case "crh":
case "ota":

View file

@ -561,7 +561,7 @@ namespace Emby.Server.Core
StringExtensions.LocalizationManager = LocalizationManager;
RegisterSingleInstance(LocalizationManager);
ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"));
ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"), JsonSerializer);
RegisterSingleInstance(textEncoding);
Utilities.EncodingHelper = textEncoding;
RegisterSingleInstance<IBlurayExaminer>(() => new BdInfoExaminer(FileSystemManager, textEncoding));

View file

@ -82,16 +82,16 @@ namespace MediaBrowser.Api.UserLibrary
/// <summary>
/// Gets or sets the optional AiredDuringSeason query value. Per the API description, it selects
/// all episodes that aired during a season, including specials.
/// </summary>
[ApiMember(Name = "AiredDuringSeason", Description = "Gets all episodes that aired during a season, including specials.", IsRequired = false, DataType = "int", ParameterType = "query", Verb = "GET")]
public int? AiredDuringSeason { get; set; }
[ApiMember(Name = "MinPremiereDate", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
/// <summary>
/// Gets or sets the optional minimum premiere date query value, supplied as an ISO formatted string.
/// </summary>
[ApiMember(Name = "MinPremiereDate", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
public string MinPremiereDate { get; set; }
[ApiMember(Name = "MinDateLastSaved", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
/// <summary>
/// Gets or sets the optional minimum last saved date query value, supplied as an ISO formatted string.
/// </summary>
[ApiMember(Name = "MinDateLastSaved", Description = "Optional. The minimum last saved date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
public string MinDateLastSaved { get; set; }
[ApiMember(Name = "MinDateLastSavedForUser", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
/// <summary>
/// Gets or sets the optional minimum last saved date for the user, supplied as an ISO formatted string.
/// </summary>
[ApiMember(Name = "MinDateLastSavedForUser", Description = "Optional. The minimum last saved date for the user. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
public string MinDateLastSavedForUser { get; set; }
[ApiMember(Name = "MaxPremiereDate", Description = "Optional. The maximum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
/// <summary>
/// Gets or sets the optional maximum premiere date query value, supplied as an ISO formatted string.
/// </summary>
[ApiMember(Name = "MaxPremiereDate", Description = "Optional. The maximum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
public string MaxPremiereDate { get; set; }
[ApiMember(Name = "HasOverview", Description = "Optional filter by items that have an overview or not.", IsRequired = false, DataType = "bool", ParameterType = "query", Verb = "GET")]