######”Possible solution at the end of the page”######
pipeline.Fit() blocks inside ML.NET's internal management: the call executes sequentially across threads,
so thread 2 cannot start its .Fit() until thread 1 has finished its .Fit().
BAD ML.NET
Per-call blocking: even though you can start multiple Fit() calls in parallel, each Fit() still blocks its own thread,
and you cannot parallelize the execution of a single Fit() call across multiple threads; Fit() always blocks the calling thread until it finishes.
So we cannot run multiple .Fit() calls simultaneously, even through multiple threads, tasks, background tasks, etc.
Another problem: because .Fit() runs on only one thread and does not execute across multiple CPU cores,
even with 100 CPU cores you will see a single core at 100% usage
while the other cores sit idle.
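One way to observe this behavior is to time the same training run sequentially and then on parallel tasks. Below is a minimal sketch, assuming the HouseData schema and CSV from the code example further down and the Microsoft.ML / Microsoft.ML.FastTree packages; if the parallel wall-clock time stays close to the sequential one, the Fit() calls were effectively serialized.
// Timing sketch (assumption: same HouseData schema and CSV as in the code example below).
using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML;

public static class FitTimingDemo
{
    static IEstimator<ITransformer> BuildPipeline(MLContext ml) =>
        ml.Transforms.Concatenate("Features", "Size", "Bedrooms")
          .Append(ml.Regression.Trainers.FastTree(
              labelColumnName: "Price", featureColumnName: "Features"));

    public static void Run(IDataView data, int n = 4)
    {
        // Sequential baseline: n Fit() calls one after another.
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < n; i++)
            BuildPipeline(new MLContext(seed: 0)).Fit(data);
        var sequential = sw.Elapsed;

        // "Parallel" attempt: n Fit() calls on separate tasks, each with its own MLContext.
        sw.Restart();
        var tasks = Enumerable.Range(0, n)
            .Select(_ => Task.Run(() => BuildPipeline(new MLContext(seed: 0)).Fit(data)))
            .ToArray();
        Task.WaitAll(tasks);
        var parallel = sw.Elapsed;

        // If parallel is close to sequential, the Fit() calls were serialized;
        // if it is close to sequential / n, they really ran in parallel.
        Console.WriteLine($"Sequential: {sequential}, Parallel: {parallel}");
    }
}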
If anyone has a solution to this inherent problem of ML.NET, please feel free to email me.
Code example:
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.FastTree;

public class HouseData
{
    // Column indices assume the CSV layout Size,Bedrooms,Price.
    [LoadColumn(0)] public float Size { get; set; }
    [LoadColumn(1)] public float Bedrooms { get; set; }
    [LoadColumn(2)] public float Price { get; set; }
}

public class HousePrediction
{
    [ColumnName("Score")]
    public float Price { get; set; }
}
var dataPath = "path/to/house_prices.csv";
var mlContext = new MLContext();
IDataView dataView = mlContext.Data.LoadFromTextFile<HouseData>(dataPath, hasHeader: true, separatorChar: ',');
private Hyperparameters SearchBestHyperparameters_TestAll(IDataView dataView)
{
    var paramGrid = new List<(float LearningRate, int NumberOfTrees, int NumLeaves)>
    {
        (0.05f, 100, 30), (0.05f, 100, 50), (0.1f, 100, 50), (0.1f, 200, 50)
        // Add more hyperparameter combinations as needed
    };

    var bestParams = new Hyperparameters();
    double bestSuccessRate = double.MinValue;
    var semaphore = new SemaphoreSlim(10);      // Limit to 10 concurrent tasks
    var completedTasksLock = new object();
    int totalTasks = paramGrid.Count;
    int completedTasks = 0;
    var tasks = new List<Task>();

    foreach (var param in paramGrid)
    {
        semaphore.Wait();
        var paramCopy = param;
        var task = Task.Run(() =>
        {
            var mlContext = new MLContext(seed: 0);  // New MLContext for each task
            try
            {
                var pipeline = mlContext.Transforms.Concatenate("Features", "Size", "Bedrooms")
                    .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options
                    {
                        LabelColumnName = "Price",
                        FeatureColumnName = "Features",
                        LearningRate = paramCopy.LearningRate,
                        NumberOfLeaves = paramCopy.NumLeaves,
                        NumberOfTrees = paramCopy.NumberOfTrees,
                        MinimumExampleCountPerLeaf = 1,
                        NumberOfThreads = Environment.ProcessorCount  // Use all available cores
                    }));

                // Train the model.
                // This line does not execute concurrently with the other tasks:
                // there is a lock inside ML.NET.
                var model = pipeline.Fit(dataView);

                // Use the model to make predictions and evaluate them.
                var predictions = model.Transform(dataView);
                var metrics = mlContext.Regression.Evaluate(predictions, labelColumnName: "Price");

                // Update the best parameters if the current model is better.
                lock (completedTasksLock)
                {
                    completedTasks++;
                    if (metrics.RSquared > bestSuccessRate)
                    {
                        bestSuccessRate = metrics.RSquared;
                        bestParams = new Hyperparameters
                        {
                            LearningRate = paramCopy.LearningRate,
                            NumberOfTrees = paramCopy.NumberOfTrees,
                            NumLeaves = paramCopy.NumLeaves
                        };
                    }
                    Console.WriteLine($"Progress: {(completedTasks / (double)totalTasks) * 100:F2}%");
                }
            }
            finally
            {
                semaphore.Release();  // Release the semaphore slot
            }
        });
        tasks.Add(task);
    }

    Task.WhenAll(tasks).Wait();  // Wait for all tasks to complete
    return bestParams;
}
// Hyperparameters class
public class Hyperparameters
{
    public float LearningRate { get; set; }
    public int NumberOfTrees { get; set; }
    public int NumLeaves { get; set; }
}
Possible Solution:
Create a separate program and build it into an executable that trains a single model, then launch that executable multiple times, once per hyperparameter set. With process-level parallelism the trainings run simultaneously and the results can be collected afterwards; a sketch follows below.
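A minimal sketch of that approach, assuming a hypothetical trainer executable named Trainer.exe that takes the learning rate, tree count, and leaf count as command-line arguments and prints the resulting R² to standard output (the executable name, argument order, and output format are illustrative assumptions, not part of ML.NET):
// Process-parallel hyperparameter search sketch.
// Assumptions (not from the original post): a trainer executable "Trainer.exe"
// that takes <learningRate> <trees> <leaves> as arguments and writes R^2 to stdout.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Threading.Tasks;

public static class ProcessParallelSearch
{
    public static async Task<(float LearningRate, int Trees, int Leaves, double RSquared)>
        RunAsync(IEnumerable<(float LearningRate, int Trees, int Leaves)> grid)
    {
        var runs = grid.Select(async p =>
        {
            var psi = new ProcessStartInfo
            {
                FileName = "Trainer.exe",   // hypothetical training executable
                Arguments = FormattableString.Invariant($"{p.LearningRate} {p.Trees} {p.Leaves}"),
                RedirectStandardOutput = true,
                UseShellExecute = false
            };
            using var process = Process.Start(psi)
                ?? throw new InvalidOperationException("Failed to start Trainer.exe");

            // Each training runs in its own OS process with its own CPU cores.
            string output = await process.StandardOutput.ReadToEndAsync();
            process.WaitForExit();
            double r2 = double.Parse(output.Trim(), CultureInfo.InvariantCulture);
            return (p.LearningRate, p.Trees, p.Leaves, RSquared: r2);
        });

        var results = await Task.WhenAll(runs);
        return results.OrderByDescending(r => r.RSquared).First();
    }
}
Because every training runs in its own OS process, the serialization inside a single process no longer applies. For large grids, the launches should still be throttled, for example with the same SemaphoreSlim pattern used in the code above.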
By: Mehdi Sadighian
Contact: mehdi.sadighian@hotmail.com
TAGS: C#, ML.NET, pipeline.Fit, pipeline, Model Parameter Tuning