Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
acd8964
preparing to convert LDATransform to the IEstimator/ITransformer para…
abgoswam Oct 24, 2018
cd2f20c
Added ColumnInfo and TransformInfo
abgoswam Oct 24, 2018
c289ed1
existing tests pass after refactor
abgoswam Oct 25, 2018
8dfb527
fix re-arragement of arguments in entrypoints
abgoswam Oct 25, 2018
bcb3b0d
refactor + cleanup of TopicSummary
abgoswam Oct 25, 2018
1d39408
enabled OnFit to return LdaState
abgoswam Oct 27, 2018
9ff60f7
fixed merge conflicts
abgoswam Nov 1, 2018
ad36d2f
fix build issues after merge; fix review comments
abgoswam Nov 1, 2018
b0e0375
taking care of review comments - 1
abgoswam Nov 10, 2018
b0422e4
Merge branch 'master' into abgoswam/LDA_conversion
abgoswam Nov 10, 2018
7bc6e2b
merge with master; re-enable LDA tests before taking care of addition…
abgoswam Nov 10, 2018
e42c5e4
review comments - 2. rename LdaTransform to LdaTransformer
abgoswam Nov 10, 2018
c099d4a
review comments - 3. throw ExceptSchemaMismatch; default values; ToIm…
abgoswam Nov 11, 2018
a1d14ed
review comments - 4. output column; expression body definition
abgoswam Nov 12, 2018
3f39a04
review comments - 4; added a basic test that exercises TestEstimatorC…
abgoswam Nov 12, 2018
57cd1c5
review comments - 5; rename _exes to _columns; preparing changes for …
abgoswam Nov 12, 2018
d4a4283
review comments - 6; make training a private static method.
abgoswam Nov 12, 2018
c7fb50a
review comments - 7; make Create() method private
abgoswam Nov 12, 2018
d7660ca
review comments - 8; added internal constructor for ColumnInfo()
abgoswam Nov 12, 2018
e0d501b
review comments - 9; fixed types for Single, Float
abgoswam Nov 12, 2018
c91afbb
review comments - 10; made LdaState internal, expose LDA summary info…
abgoswam Nov 13, 2018
4238fa1
review comments - 11; added a command line unit test
abgoswam Nov 13, 2018
34bb2e9
review comments - 12; no-op when there is no data (maml command line …
abgoswam Nov 13, 2018
8b70ab1
review comments - 13; added mlcontext extension for LDA
abgoswam Nov 14, 2018
b3c1284
Merge branch 'master' into abgoswam/LDA_conversion
abgoswam Nov 14, 2018
b6e4028
review comments - 14; code refactor; avoid using abbreviation(Lda); r…
abgoswam Nov 15, 2018
5397de5
review comments - 15; schema changes
abgoswam Nov 15, 2018
edd60af
review comments - 15; provide better user-facing description. renamed…
abgoswam Nov 15, 2018
b073038
review comments - 16; fix build break
abgoswam Nov 15, 2018
49da3ee
review comments - 16; include words in LdaSummary (this also resolves…
abgoswam Nov 16, 2018
0724290
fixing merge conflicts
abgoswam Nov 16, 2018
5073baa
fixing merge conflicts
abgoswam Nov 17, 2018
d1481f8
fix build break because of manifest changes
abgoswam Nov 17, 2018
65125d4
Merge branch 'master' into abgoswam/LDA_conversion
abgoswam Nov 20, 2018
b869d7f
updated to latest interface changes
abgoswam Nov 20, 2018
62955a8
review comments - 17; named tuple, namespace change, fix CheckParams
abgoswam Nov 20, 2018
40333a7
review comments - 18; ImmutableArray
abgoswam Nov 20, 2018
850856b
review comments - 19; update summary text; remove dup code in constru…
abgoswam Nov 20, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
review comments - 5; rename _exes to _columns; preparing changes for …
…next iteration (i.e. make training a private sttaic method. removed _types as field)
  • Loading branch information
abgoswam committed Nov 12, 2018
commit 57cd1c5bd4eba48b208b7551deebee636538ced5
57 changes: 26 additions & 31 deletions src/Microsoft.ML.Transforms/Text/LdaTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,10 @@ public override Schema.Column[] GetOutputColumns()
{
var result = new Schema.Column[_parent.ColumnPairs.Length];
for (int i = 0; i < _parent.ColumnPairs.Length; i++)
result[i] = new Schema.Column(_parent.ColumnPairs[i].output, _parent._types[i], null);
{
var info = _parent._columns[i];
result[i] = new Schema.Column(_parent.ColumnPairs[i].output, new VectorType(NumberType.Float, info.NumTopic), null);
Copy link
Contributor

@Zruty0 Zruty0 Nov 14, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_parent.ColumnPairs[i].output [](start = 50, length = 29)

is this equal to info.Input? #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is equal to info.Output. If output column name is not specified then info.Output is same as info.Input.

Is there any concern here ?


In reply to: 233584111 [](ancestors = 233584111)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well, info.Output is shorter, so I'd rather use it.


In reply to: 233740244 [](ancestors = 233740244,233584111)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

keeping this as-is

We are using SaveColumns from OneToOneTransformer base. So when loading up models, we should be using _parent.ColumnPairs[i].output (info.Output may be null in those cases)


In reply to: 234857190 [](ancestors = 234857190,233740244,233584111)

}
return result;
}

Expand Down Expand Up @@ -684,9 +687,8 @@ private static VersionInfo GetVersionInfo()
loaderAssemblyName: typeof(LdaTransformer).Assembly.FullName);
}

private readonly ColumnInfo[] _exes;
private readonly ColumnInfo[] _columns;
private readonly LdaState[] _ldas;
private readonly ColumnType[] _types;

private const string RegistrationName = "LightLda";
private const string WordTopicModelFilename = "word_topic_summary.txt";
Expand All @@ -703,14 +705,9 @@ private static (string input, string output)[] GetColumnPairs(ColumnInfo[] colum
internal LdaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] columns)
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(LdaTransformer)), GetColumnPairs(columns))
{
_exes = columns;
_types = new ColumnType[columns.Length];
_columns = columns;
_ldas = new LdaState[columns.Length];

for (int i = 0; i < columns.Length; i++)
{
_types[i] = new VectorType(NumberType.Float, _exes[i].NumTopic);
}
using (var ch = Host.Start("Train"))
{
Train(ch, input, _ldas);
Copy link
Contributor

@artidoro artidoro Nov 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Train [](start = 16, length = 5)

Pete suggested that we make training a private static method in the transform, and that we have a constructor for the transform that will accept the trained parameters as an input.

This should make it easier to allow initialization of the transform using parameters coming from an external training routine. It is not something that we are exposing now, however.

This allows the transform to only keep the fields storing trained parameters. For example _types should no longer be needed as a field in the transform. #Closed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have implemented something like this for whitening.


In reply to: 231639657 [](ancestors = 231639657)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed. thanks for giving the context about this approach.


In reply to: 231640188 [](ancestors = 231640188,231639657)

Expand All @@ -728,14 +725,12 @@ private LdaTransformer(IHost host, ModelLoadContext ctx) : base(host, ctx)

// Note: columnsLength would be just one in most cases.
var columnsLength = ColumnPairs.Length;
_exes = new ColumnInfo[columnsLength];
_columns = new ColumnInfo[columnsLength];
_ldas = new LdaState[columnsLength];
Copy link
Contributor

@Zruty0 Zruty0 Nov 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LdaState [](start = 24, length = 8)

are these guys reentrant? #Closed

Copy link
Member Author

@abgoswam abgoswam Nov 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you kindly clarify what you mean by 're-entrant' ?

in any case I have made LdaState internal, so hopefully its not an issue anymore


In reply to: 232437876 [](ancestors = 232437876)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re-entrant means that any methods can be called in parallel from multiple threads


In reply to: 232470000 [](ancestors = 232470000,232437876)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am going by existing comments here..
"LdaSingleBox.InitializeBeforeTest() is NOT thread-safe"

He have locks in place to make it thread-safe.


In reply to: 233585350 [](ancestors = 233585350,232470000,232437876)

_types = new ColumnType[columnsLength];
for (int i = 0; i < _ldas.Length; i++)
{
_ldas[i] = new LdaState(Host, ctx);
_exes[i] = _ldas[i].InfoEx;
_types[i] = new VectorType(NumberType.Float, _ldas[i].InfoEx.NumTopic);
_columns[i] = _ldas[i].InfoEx;
}
}

Expand Down Expand Up @@ -861,13 +856,13 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)
Host.AssertValue(ch);
ch.AssertValue(trainingData);
ch.AssertValue(states);
ch.Assert(states.Length == _exes.Length);
ch.Assert(states.Length == _columns.Length);

bool[] activeColumns = new bool[trainingData.Schema.ColumnCount];
int[] numVocabs = new int[_exes.Length];
int[] srcCols = new int[_exes.Length];
int[] numVocabs = new int[_columns.Length];
int[] srcCols = new int[_columns.Length];

for (int i = 0; i < _exes.Length; i++)
for (int i = 0; i < _columns.Length; i++)
{
if (!trainingData.Schema.TryGetColumnIndex(ColumnPairs[i].input, out int srcCol))
throw Host.ExceptSchemaMismatch(nameof(trainingData), "input", ColumnPairs[i].input);
Expand All @@ -880,13 +875,13 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)
//the current lda needs the memory allocation before feedin data, so needs two sweeping of the data,
//one for the pre-calc memory, one for feedin data really
//another solution can be prepare these two value externally and put them in the beginning of the input file.
long[] corpusSize = new long[_exes.Length];
int[] numDocArray = new int[_exes.Length];
long[] corpusSize = new long[_columns.Length];
int[] numDocArray = new int[_columns.Length];

using (var cursor = trainingData.GetRowCursor(col => activeColumns[col]))
{
var getters = new ValueGetter<VBuffer<Double>>[_exes.Length];
for (int i = 0; i < _exes.Length; i++)
var getters = new ValueGetter<VBuffer<Double>>[_columns.Length];
for (int i = 0; i < _columns.Length; i++)
{
corpusSize[i] = 0;
numDocArray[i] = 0;
Expand All @@ -898,7 +893,7 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)
while (cursor.MoveNext())
{
++rowCount;
for (int i = 0; i < _exes.Length; i++)
for (int i = 0; i < _columns.Length; i++)
{
int docSize = 0;
getters[i](ref src);
Expand All @@ -914,7 +909,7 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)
break;
}

if (docSize >= _exes[i].NumMaxDocToken - termFreq)
if (docSize >= _columns[i].NumMaxDocToken - termFreq)
break; //control the document length

//if legal then add the term
Expand All @@ -934,7 +929,7 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)
}
}

for (int i = 0; i < _exes.Length; ++i)
for (int i = 0; i < _columns.Length; ++i)
{
if (numDocArray[i] != rowCount)
{
Expand All @@ -945,9 +940,9 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)
}

// Initialize all LDA states
for (int i = 0; i < _exes.Length; i++)
for (int i = 0; i < _columns.Length; i++)
{
var state = new LdaState(Host, _exes[i], numVocabs[i]);
var state = new LdaState(Host, _columns[i], numVocabs[i]);
if (numDocArray[i] == 0 || corpusSize[i] == 0)
throw ch.Except("The specified documents are all empty in column '{0}'.", ColumnPairs[i].input);

Expand All @@ -957,11 +952,11 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)

using (var cursor = trainingData.GetRowCursor(col => activeColumns[col]))
{
int[] docSizeCheck = new int[_exes.Length];
int[] docSizeCheck = new int[_columns.Length];
// This could be optimized so that if multiple trainers consume the same column, it is
// fed into the train method once.
var getters = new ValueGetter<VBuffer<Double>>[_exes.Length];
for (int i = 0; i < _exes.Length; i++)
var getters = new ValueGetter<VBuffer<Double>>[_columns.Length];
for (int i = 0; i < _columns.Length; i++)
{
docSizeCheck[i] = 0;
getters[i] = RowCursorUtils.GetVecGetterAs<Double>(NumberType.R8, cursor, srcCols[i]);
Expand All @@ -971,13 +966,13 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states)

while (cursor.MoveNext())
{
for (int i = 0; i < _exes.Length; i++)
for (int i = 0; i < _columns.Length; i++)
{
getters[i](ref src);
docSizeCheck[i] += states[i].FeedTrain(Host, in src);
}
}
for (int i = 0; i < _exes.Length; i++)
for (int i = 0; i < _columns.Length; i++)
{
Host.Assert(corpusSize[i] == docSizeCheck[i]);
states[i].CompleteTrain();
Expand Down