/// <summary>
/// A single sentence from the dataset, optionally tagged with a category label.
/// </summary>
public class SentenceModel
{
    /// <summary>Creates a sentence without a category label.</summary>
    public SentenceModel(string text)
    {
        Text = text;
    }

    /// <summary>Creates a sentence together with its numeric category label.</summary>
    public SentenceModel(string text, double category)
        : this(text)
    {
        Category = category;
    }

    //Raw sentence text
    public string Text { get; private set; }

    //Numeric category label of the sentence
    public double Category { get; private set; }
}
/// <summary>
/// Loads data from an Excel file into a DataSet via OLE DB.
/// </summary>
/// <param name="fileName">Path of the Excel workbook to read</param>
/// <param name="sheet">Name of the worksheet (without the trailing '$')</param>
/// <param name="columns">Columns which need to be fetched; if null, all columns will be fetched</param>
public DataSet LoadData(string fileName, string sheet, string[] columns = null)
{
    if (!File.Exists(fileName))
        throw new FileNotFoundException("Dataset file was not found.", fileName);
    var connStr =
        string.Format(
            "Provider=Microsoft.ACE.OLEDB.12.0;Data Source={0};Extended Properties=\"Excel 12.0 Xml;HDR=YES;IMEX=1\";",
            fileName);
    //Create the dataset with a single table named after the sheet
    var dataSet = new DataSet();
    dataSet.Tables.Add(new DataTable(sheet));
    var table = dataSet.Tables[0];
    //An explicit column list defines the table schema up front
    if (columns != null)
        for (var i = 0; i < columns.Length; i++)
            table.Columns.Add(columns[i]);
    var items = new List<object>();
    using (var conn = new OleDbConnection(connStr))
    {
        conn.Open();
        var sql = string.Format(@"SELECT * FROM [{0}$]", sheet);
        //BUG FIX: the command and reader were never disposed; wrap both in
        //using statements so they are released deterministically.
        using (var cmd = new OleDbCommand(sql, conn))
        using (var reader = cmd.ExecuteReader())
        {
            //No explicit columns: mirror the sheet's own column names
            if (columns == null)
                for (var i = 0; i < reader.FieldCount; i++)
                    table.Columns.Add(reader.GetName(i));
            //Both branches of the original loop did the same work; the only
            //difference was the column count, so compute it once.
            var fieldCount = columns != null ? table.Columns.Count : reader.FieldCount;
            while (reader.Read())
            {
                items.Clear();
                for (var d = 0; d < fieldCount; d++)
                    items.Add(reader[d]);
                var dataRow = table.NewRow();
                dataRow.ItemArray = items.ToArray();
                table.Rows.Add(dataRow);
            }
        }
        return dataSet;
    }
}
//Load the dataset from the dataset file
var data = LoadData("Filename of the dataset file", "Book1", new[] { "Text", "Category" });
//Define the list that will hold the sentence models
var sentences = new List<SentenceModel>();
//Insert every row of the sheet into the list
foreach (DataRow row in data.Tables[0].Rows)
{
    //BUG FIX: the original redeclared `sentences` inside the loop (shadowing
    //the list) and then called Add on the model itself; use a distinct local
    //for the single model and add it to the outer list.
    var sentence = new SentenceModel(row["Text"].ToString(), Convert.ToDouble(row["Category"].ToString()));
    sentences.Add(sentence);
}
/// <summary>
/// One entry of the word bag: a word paired with its unique numeric id.
/// </summary>
public class WordBagItem
{
    /// <summary>Creates a bag entry for the given word and id.</summary>
    public WordBagItem(string word, int id)
    {
        Word = word;
        Id = id;
    }

    //The word text
    public string Word { get; private set; }

    //Numeric id used as the feature value for this word
    public int Id { get; private set; }
}
/// <summary>
/// Builds the word bag: every distinct (case-insensitive) word across all
/// sentences gets a unique, 1-based id.
/// </summary>
private void PrepareWordBag()
{
    for (var i = 0; i < sentences.Count(); i++)
    {
        var words = sentences[i].Text.ExtractFeatures().FilterFeatures();
        foreach (var word in words)
        {
            //BUG FIX: the original lookup compared the stored lowercased word
            //against the RAW token (`x.Word.ToLower() == t`), so a mixed-case
            //token like "Great" never matched an existing "great" entry and
            //was inserted again under a new id. Normalize once and use the
            //normalized form for both the lookup and the insert.
            var normalized = word.Trim().ToLower();
            if (wordBag.FirstOrDefault(x => x.Word.ToLower() == normalized) == null)
                wordBag.Add(new WordBagItem(normalized, wordBag.Count + 1));
        }
    }
}
/// <summary>
/// String/sequence extension helpers used for feature extraction.
/// </summary>
public static class Helper
{
    /// <summary>
    /// Strips all Unicode punctuation from the text and splits the remainder
    /// into tokens on single spaces.
    /// </summary>
    public static IEnumerable<string> ExtractFeatures(this string text)
    {
        var withoutPunctuation = Regex.Replace(text, "\\p{P}+", "");
        return withoutPunctuation.Split(' ').ToList();
    }

    /// <summary>
    /// Removes stop words, digit tokens and punctuation tokens from the list.
    /// The comparison is exact (case-sensitive), matching the original filter.
    /// </summary>
    public static IEnumerable<string> FilterFeatures(this IEnumerable<string> list)
    {
        var filters = new[]
        {
            ".", ",", "!", "?", ":", ";", "_", "+", "/", @"\", "*", "the", "of", "on", "is", "a", "...", "1", "2", "3",
            "4", "5", "6", "7", "8", "9", "0", " ", "for", "an", "", "it", "she", "he", "they", "we", "them", "our",
            "his", "her"
        };
        return list.Where(token => !filters.Contains(token)).ToList();
    }
}
//One training row for the text classifier: a single word occurrence,
//represented by its bag id and tagged with the label of its sentence.
public class NaiveBayesTextModel
{
//Category of the sentence this word came from; [AiLabel] marks the target
[AiLabel]
public double Label { get; set; }
//Numeric id of the word from the word bag; [AiField] marks a feature
[AiField]
public double WordId { get; set; }
//The word itself — presumably ignored by the trainer since it carries no
//attribute; kept for readability. TODO confirm against the converter.
public string Word { get; set; }
}
//Project every recognised word of every sentence into a NaiveBayesTextModel row.
var dataSet = new List<NaiveBayesTextModel>();
foreach (var sentence in sentences)
{
    foreach (var rawWord in sentence.Text.ExtractFeatures())
    {
        var lowered = rawWord.ToLower();
        var bagEntry = wordBag.FirstOrDefault(x => x.Word == lowered);
        //Words that never made it into the bag (stop words etc.) are skipped
        if (bagEntry == null)
            continue;
        dataSet.Add(new NaiveBayesTextModel
        {
            WordId = bagEntry.Id,
            Label = sentence.Category,
            Word = lowered
        });
    }
}
The "Word" term above can be any word from our dataset. Say we have the word "Great": when we calculate the probability for a given class, we use "Great" in place of "Word" in the formula (in our case we use the Id which we generate for each word).
Now, we are ready to start implementation of Naive Bayes Multinomial.
The code you see below belongs to Ellipses, one of my open-source machine learning libraries.
namespace Ellipses.Interfaces
{
//Entry point of the naive bayes implementations: load a data set, then Fit()
//to obtain a trained predictor.
public interface INaiveBayes
{
/// <summary>
/// Load data set
/// </summary>
/// <typeparam name="T">Model type of the data set rows</typeparam>
/// <param name="models">Data set</param>
/// <param name="normalization">Normalize data before training</param>
void LoadDataSet<T>(T[] models, bool normalization = false);
/// <summary>
/// Trains model for prediction
/// </summary>
/// <returns>A predictor built from the trained probabilities</returns>
INaiveBayesPredicter Fit();
}
}
/* ========================================================================
* Ellipses Machine Learning Library 1.0
* https://www.ellipsesai.com
* ========================================================================
*
* Copyright Ali Gulum
*
* ========================================================================
* Licensed under the Creative Commons Attribution-NonCommercial 4.0 International License;
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://creativecommons.org/licenses/by-nc/4.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ========================================================================
*/
using System.Collections.Generic;
using System.Linq;
using Ellipses.Helpers;
using Ellipses.Interfaces;
using Ellipses.Metrics;
namespace Ellipses.Algorithms.Nb
{
    /// <summary>
    /// Facade of the naive bayes implementations: converts the raw models,
    /// optionally normalizes them, and delegates training to the algorithm.
    /// </summary>
    public class NaiveBayes : INaiveBayes
    {
        //Trainer that performs the probability computations
        private readonly INaiveBayesAlgorithm _algorithm;
        //Helper class for converting models into numeric form
        private readonly IConverter _converter;
        //Normalizer applied to the data set on demand
        private readonly INormalizer _normalizer;
        //Data set as a jagged double array
        private double[][] _dataSet;
        //Data set as a matrix
        private Matrix _matrix;
        //Labels as a matrix
        private Matrix _matrixLabel;

        /// <summary>
        /// Naive Bayes
        /// </summary>
        /// <param name="algorithm">Algorithm for the naive bayes; defaults to the binary algorithm</param>
        /// <param name="normalizer">Normalizer; defaults to the library normalizer</param>
        /// <param name="converter">Converter for the models; defaults to the library converter</param>
        public NaiveBayes(INaiveBayesAlgorithm algorithm = null, INormalizer normalizer = null,
            IConverter converter = null)
        {
            _algorithm = algorithm ?? new NaiveBayesBinaryAlgorithm();
            _normalizer = normalizer ?? new Normalizer();
            _converter = converter ?? new Converter();
        }

        /// <summary>
        /// Load data set
        /// </summary>
        /// <param name="models">Data set</param>
        /// <param name="normalization">Normalize data</param>
        public void LoadDataSet<T>(T[] models, bool normalization = false)
        {
            var hasDimensionalField = _converter.IsDimensionalFieldExist(models);
            _dataSet = _converter.ConvertModels(models);
            if (normalization)
                _dataSet = _normalizer.Normalize(_dataSet);
            _matrix = new Matrix(_dataSet);
            _matrixLabel = _converter.ConvertLabelsToMatrix(models);
            //Models carrying dimensional fields get their extra columns appended
            if (hasDimensionalField)
                PrepareDimensionalData(_converter.GetDimensionalData(models, normalization));
        }

        /// <summary>
        /// Trains model for prediction
        /// </summary>
        public INaiveBayesPredicter Fit()
        {
            //Calculate the prior probabilities of the labels
            _algorithm.ComputeProbabilityOfLabels(_matrixLabel);
            //Calculate the conditional probabilities of the features
            _algorithm.ComputeProbabilityOfFeatures(_matrix, _matrixLabel);
            //Prepare and return the trained model for prediction
            return _algorithm.GetPredictor();
        }

        #region Helpers
        /// <summary>
        /// Appends each dimensional row to its matching data row and rebuilds
        /// both the data matrix and the label matrix.
        /// </summary>
        /// <param name="dimensionalMatrix">Matrix of dimensional data</param>
        private void PrepareDimensionalData(Matrix dimensionalMatrix)
        {
            var combinedRows = new List<double[]>();
            var labelRows = new List<double[]>();
            //One pass: stitch data row + dimensional row, carry the label along
            for (var row = 0; row < dimensionalMatrix.Rows; row++)
            {
                var dataRow = _matrix[row].ToArray();
                var extraRow = dimensionalMatrix[row].ToArray();
                combinedRows.Add(dataRow.Concat(extraRow).ToArray());
                labelRows.Add(_matrixLabel[row].ToArray());
            }
            _matrix = new Matrix(combinedRows.ToArray());
            _matrixLabel = new Matrix(labelRows.ToArray());
        }
        #endregion
    }
}
using Ellipses.Metrics;
namespace Ellipses.Interfaces
{
//Strategy interface implemented by the concrete naive bayes algorithms.
public interface INaiveBayesAlgorithm
{
/// <summary>
/// Computes probability of labels (the priors)
/// </summary>
/// <param name="labelMatrix">Reference of Label Matrix</param>
void ComputeProbabilityOfLabels(Matrix labelMatrix);
/// <summary>
/// Computes probability of features (the conditionals)
/// </summary>
/// <param name="matrix">Matrix of data</param>
/// <param name="labelMatrix">Reference of Label Matrix</param>
void ComputeProbabilityOfFeatures(Matrix matrix, Matrix labelMatrix);
/// <summary>
/// Returns predictor for the algorithm
/// </summary>
INaiveBayesPredicter GetPredictor();
}
}
/* ========================================================================
* Ellipses Machine Learning Library 1.0
* https://www.ellipsesai.com
* ========================================================================
*
* Copyright Ali Gulum
*
* ========================================================================
* Licensed under the Creative Commons Attribution-NonCommercial 4.0 International License;
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://creativecommons.org/licenses/by-nc/4.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ========================================================================
*/
using System;
using System.Collections.Concurrent;
using System.Linq;
using System.Threading.Tasks;
using Ellipses.Helpers;
using Ellipses.Interfaces;
using Ellipses.Metrics;
using Ellipses.Models;
namespace Ellipses.Algorithms.Nb
{
//Trainer for the multinomial variant: computes label priors and
//Laplace-smoothed feature conditionals over the numeric data matrix.
public class NaiveBayesMultinomialAlgorithm : INaiveBayesAlgorithm
{
//Tolerance used when comparing double-encoded labels/feature values
private const double TOLERANCE = 0.1;
//Helper for the various operations
private readonly IHelper _helper;
//Probabilities of the features (filled by ComputeProbabilityOfFeatures)
private ConcurrentBag<NaiveBayesProbability> _featureProbabilities;
//Matrix of labels, kept so the predictor can enumerate distinct labels
private Matrix _labelMatrix;
//Prior probabilities of the labels (label value -> probability)
private ConcurrentDictionary<double, double> _labelProbabilities;
public NaiveBayesMultinomialAlgorithm()
{
_helper = new Helper();
}
/// <summary>
/// Computes the prior probability of each distinct label:
/// P(label) = (rows with that label) / (total rows).
/// </summary>
/// <param name="labelMatrix">Reference of Label Matrix (one label per row, column 0)</param>
public void ComputeProbabilityOfLabels(Matrix labelMatrix)
{
var labelProbabilities = new ConcurrentDictionary<double, double>();
var labelList = labelMatrix.GetRows().Select(x => x[0]).Distinct().ToList();
for (var i = 0; i < labelList.Count(); i++)
{
var lbl = labelList[i];
//Count rows whose label matches the current label (within TOLERANCE)
var tLbl = labelMatrix.GetRows().Where(x => Math.Abs(x[0] - lbl) < TOLERANCE).ToList().Count();
var pLabel = (double) tLbl/labelMatrix.Rows;
labelProbabilities.TryAdd(lbl, pLabel);
}
_labelProbabilities = labelProbabilities;
}
/// <summary>
/// Computes the Laplace-smoothed conditional probability
/// P(feature value | label) for every cell of the data matrix.
/// </summary>
/// <param name="matrix">Matrix of data</param>
/// <param name="labelMatrix">Reference of Label Matrix</param>
public void ComputeProbabilityOfFeatures(Matrix matrix, Matrix labelMatrix)
{
var featureProbabilities = new ConcurrentBag<NaiveBayesProbability>();
//Connect matrix: append the label column to the data so rows can be
//filtered by label while scanning features
var connectedMatrix = matrix.ConnectMatrix(labelMatrix);
var labelList = labelMatrix.GetRows().Select(x => x[0]).Distinct().ToList();
var featureLength = connectedMatrix.Cols;
var matrixRows = connectedMatrix.GetRows();
var vectors = matrixRows as Vector[] ?? matrixRows.ToArray();
//NOTE(review): the nested Parallel.For loops recompute the same
//(label, column, value) probability for every row that repeats a value,
//so _featureProbabilities can contain duplicate entries, and the
//per-cell Distinct() below is O(rows) inside an O(labels*rows*cols)
//loop — confirm whether de-duplicating values up front was intended.
Parallel.For(0, labelList.Count, l =>
{
Parallel.For(0, connectedMatrix.Rows, i =>
{
//Number of training rows belonging to the current label
var fTotalAccordingToLabel =
vectors.Count(x => Math.Abs(x[featureLength - 1] - labelList[l]) < TOLERANCE);
var lUnique = connectedMatrix[i, featureLength - 1];
Parallel.For(0, matrix.Cols, j =>
{
var f = connectedMatrix[i, j];
//Rows with the current label AND the same value in column j
var fTotal =
vectors.Count(
x =>
Math.Abs(x[featureLength - 1] - labelList[l]) < TOLERANCE &&
Math.Abs(x[j] - f) < TOLERANCE);
//Distinct values in column j (the "vocabulary" size for smoothing)
var fUniqueTotal = matrix.Select(x => x[j]).Distinct().ToList().Count;
//Laplace (add-one) smoothing
var fProbability = ((double) fTotal + 1)/((double) fTotalAccordingToLabel + fUniqueTotal);
var probability = new NaiveBayesProbability
{
FeatureIndex = j,
FeatureProbability = Math.Abs(fProbability),
Feature = f,
FeatureTotal = fTotal,
Label = labelList[l],
LabelUnique = lUnique
};
featureProbabilities.Add(probability);
});
});
});
_labelMatrix = labelMatrix;
_featureProbabilities = featureProbabilities;
}
/// <summary>
/// Returns the predictor built from the probabilities computed above.
/// </summary>
public INaiveBayesPredicter GetPredictor()
{
return new NaiveBayesMultinomialPredicter(_labelProbabilities, _featureProbabilities, _labelMatrix);
}
}
}
using System.Collections.Generic;
using Ellipses.Models;
namespace Ellipses.Interfaces
{
//Trained-model interface returned by Fit(): makes predictions for new models.
public interface INaiveBayesPredicter
{
/// <summary>
/// Predicts according to model
/// </summary>
/// <param name="model">Model for prediction label</param>
/// <returns>The most probable label</returns>
double Predict<T>(T model);
/// <summary>
/// Predicts according to model and return all probabilities for all labels
/// </summary>
/// <param name="model">Model for prediction label</param>
/// <returns>One result per label with its probability</returns>
List<NaiveBayesResult> PredictWithProbabilities<T>(T model);
}
}
/* ========================================================================
* Ellipses Machine Learning Library 1.0
* https://www.ellipsesai.com
* ========================================================================
*
* Copyright Ali Gulum
*
* ========================================================================
* Licensed under the Creative Commons Attribution-NonCommercial 4.0 International License;
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://creativecommons.org/licenses/by-nc/4.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ========================================================================
*/
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Ellipses.Helpers;
using Ellipses.Interfaces;
using Ellipses.Metrics;
using Ellipses.Models;
namespace Ellipses.Algorithms.Nb
{
    /// <summary>
    /// Predictor produced by the multinomial algorithm: combines the stored
    /// label priors and feature conditionals into a posterior score per label.
    /// </summary>
    internal class NaiveBayesMultinomialPredicter : INaiveBayesPredicter
    {
        //Tolerance used when comparing double-encoded labels/feature values
        private const double TOLERANCE = 0.1;
        //Helper class for converting models
        private readonly IConverter _converter;
        //Probabilities of the features
        private readonly ConcurrentBag<NaiveBayesProbability> _featureProbabilities;
        //Probabilities of the labels
        private readonly ConcurrentDictionary<double, double> _labelProbabilities;
        //Distinct labels from the label matrix
        private readonly List<double> _lbls;

        /// <summary>
        /// Naive Bayes Predicter
        /// </summary>
        /// <param name="labelProbabilities">Probability set for the labels</param>
        /// <param name="featureProbabilities">Probability set for the features</param>
        /// <param name="labelMatrix">Matrix of label</param>
        public NaiveBayesMultinomialPredicter(ConcurrentDictionary<double, double> labelProbabilities,
            ConcurrentBag<NaiveBayesProbability> featureProbabilities, Matrix labelMatrix)
        {
            _converter = new Converter();
            _labelProbabilities = labelProbabilities;
            _featureProbabilities = featureProbabilities;
            _lbls = labelMatrix.GetRows().Select(x => x[0]).Distinct().ToList();
        }

        /// <summary>
        /// Predicts according to model
        /// </summary>
        /// <param name="model">Model for prediction label</param>
        public double Predict<T>(T model)
        {
            //Pick the label with the highest posterior score
            return GetPrediction(model).Aggregate((l, r) => l.Value > r.Value ? l : r).Key;
        }

        /// <summary>
        /// Predicts according to model and return all probabilities for all labels
        /// </summary>
        /// <param name="model">Model for prediction label</param>
        public List<NaiveBayesResult> PredictWithProbabilities<T>(T model)
        {
            return
                GetPrediction(model).Select(v => new NaiveBayesResult {Label = v.Key, Probability = v.Value}).ToList();
        }

        /// <summary>
        /// Computes the posterior score for each label:
        /// P(label) * product of P(feature | label) over the model's features,
        /// falling back to a Laplace-smoothed estimate for unseen pairs.
        /// </summary>
        /// <param name="model">Model for prediction label</param>
        private ConcurrentDictionary<double, double> GetPrediction<T>(T model)
        {
            var probabilities = new ConcurrentDictionary<double, double>();
            var modelConverted = _converter.ConvertModel(model);
            var dimensionalProcess = _converter.IsDimensional(model);
            if (dimensionalProcess)
            {
                //Append the dimensional feature rows to the converted model values
                var newList = new List<double[]>();
                var dimensionalFeatures = _converter.ConvertDimensionalModel(model);
                var orjModelValues = modelConverted.Select(t => t.ToArray()).ToList();
                for (var dimensionalRow = 0; dimensionalRow < dimensionalFeatures.Length; dimensionalRow++)
                    newList.AddRange(orjModelValues.Select(t => t.Concat(dimensionalFeatures[dimensionalRow]).ToArray()));
                modelConverted = newList.ToArray();
            }
            Parallel.For(0, _labelProbabilities.Count, l =>
            {
                var label = _labelProbabilities.Keys.ElementAt(l);
                //NOTE(review): both operands below are ints, so this division
                //truncates — confirm the intended denominator for the
                //smoothing fallback further down.
                var fTotalAccordingToLabel =
                    _featureProbabilities.Where(x => Math.Abs(x.LabelUnique - label) < TOLERANCE).Distinct().Count()/
                    _labelProbabilities.Count;
                var totalDot = 1.0;
                //BUG FIX: this inner loop was a Parallel.For, but
                //`totalDot *= ...` is a non-atomic read-modify-write on a local
                //captured by every inner iteration — a data race that could
                //drop factors and produce nondeterministic predictions.
                //Iterating the features of a single model sequentially is cheap
                //and deterministic; the outer loop still parallelizes across
                //labels, each with its own totalDot.
                for (var i = 0; i < modelConverted[0].Count(); i++)
                {
                    var f = modelConverted[0][i];
                    var fVal =
                        _featureProbabilities.FirstOrDefault(
                            x => Math.Abs(x.Feature - f) < TOLERANCE && Math.Abs(x.Label - label) < TOLERANCE);
                    if (fVal != null)
                    {
                        totalDot *= fVal.FeatureProbability;
                    }
                    else
                    {
                        //Feature never seen with this label: Laplace-smoothed fallback
                        const int fTotal = 0;
                        var fUniqueTotal = _featureProbabilities.Select(x => x.Feature).Distinct().ToList().Count;
                        var fProbability = ((double) fTotal + 1)/((double) fTotalAccordingToLabel + fUniqueTotal);
                        totalDot *= fProbability;
                    }
                }
                var probability = _labelProbabilities[label]*totalDot;
                probabilities.TryAdd(label, probability);
            });
            return probabilities;
        }
    }
}
//Build and train the classifier on the word-id data set
var naiveBayes = new NaiveBayes(new NaiveBayesMultinomialAlgorithm());
naiveBayes.LoadDataSet(dataSet.ToArray());
var predicter = naiveBayes.Fit();
//Sentence we want to classify (no category label yet)
var sentences = new SentenceModel("It is a really great product, I like it!");
var words = (sentences.Text.ExtractFeatures());
var values = new List<double>();
//NOTE(review): "NaiveBayesTextTextModel" looks like a typo (doubled "Text"),
//and the type used here must expose a double[][] Values property — the
//NaiveBayesTextModel defined earlier does not; confirm the intended type.
var naiveBayesTextModel = new NaiveBayesTextTextModel();
//Map every word of the sentence to its bag id, growing the bag for new words
foreach (var t in words)
{
var wrd = wordBag.FirstOrDefault(x => x.Word.ToLower() == t);
if (wrd == null)
wordBag.Add(new WordBagItem(t.Trim().ToLower(), wordBag.Count + 1));
var firstOrDefault = wordBag.FirstOrDefault(x => x.Word == t.ToLower());
if (firstOrDefault == null) continue;
values.Add(firstOrDefault.Id);
}
//Single-row feature matrix of word ids for the prediction call
naiveBayesTextModel.Values = new double[1][];
naiveBayesTextModel.Values[0] = values.ToArray();
var res = predicter.PredictWithProbabilities(naiveBayesTextModel);
Classification
K-Nearest Neighbor is one of the best-known and simplest machine learning algo...
K-Nearest Neighbor is one of the best-known and simplest machine learning algorithms, and it is very suitable for many real-world problems such as product recommendation, social media friend recommendation based on interests, or a person's social network.
Classification
The super strong wind of the Machine Learning is turning our heads incredib...
The super strong wind of Machine Learning is turning our heads incredibly; we see examples of machine learning usage in almost every area of technology. Face detection, voice recognition, text recognition, etc. — there are plenty of them, and each of them has a different type of approach to machine learning.
Classification
In the previous article, we finished the first part of our example project,...
In the previous article, we finished the first part of our example project; now we have a SentimentAnalyst class which we can use for training data and making predictions by passing real data to it. Today we are going to work on Part 2, a trainer project that will handle the training process.