From 95c1b6a7c780752027324ca368c061765be0088a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 15 Nov 2018 15:58:57 -0500 Subject: [PATCH 01/87] [src] Add draft of interface for svd backprop thing. --- src/matrix/matrix-functions.h | 68 +++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index ca50ddda7c8..9a3e7353abd 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -164,6 +164,74 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase } +/* + This class allows you to compute the class of function described in + http://www.danielpovey.com/files/2018_svd_derivative.pdf + and to backprop through that computation. + Short summary: it allows you to apply some kind of scalar function + to the singular values of a matrix, reconstruct it, and then backprop + through that operation. + + This class is quite general-purpose in the sense that you can + provide any scalar function; but in order to avoid things like + passing function-pointers around, we had give it a rather clunky + interface. The way you are supposed to use it is as follows + (to give an example): + + Matrix A(...); // set it somehow. + SvdRescaler rescaler(A); + const VectorBase &lambda_in = A.InputSingularValues(); + VectorBase &lambda_out = *(A.OutputSingularValues()); + VectorBase &lambda_out_deriv = *(A.OutputSingularValues()); + for (int32 i = 0; i < lambda_in.size(); i++) { + // compute the scalar function and its derivative for the singular + // values. + lambda_out(i) = some_func(lambda_in(i)); + lambda_out_deriv(i) = some_func_deriv(lambda_in(i)); + } + Matrix B(A.NumRows(), A.NumCols(), kUndefined); + rescaler.GetOutput(&B); + // Do something with B. + Matrix B_deriv(...); // Get the derivative w.r.t. B + // somehow. + Matrix A_deriv(A.NumRows(), A.NumCols()); // Get the derivative w.r.t. A. + + + */ +class SvdRescaler { + + // Constructor. The parameter is the input matrix A. + SvdRescaler(const MatrixBase &A); + + // Get the singular values of A, which will have been + // computed in the constructor + const VectorBase &InputSingularValues(); + // Returns a pointer to a place that you can write the + // modified singular values f(lambda). + VectorBase *OutputSingularValues(); + // Returns a pointer to a place that you can write the + // values of f'(lambda) (the function-derivative of f). + VectorBase *OutputSingularValuesDerivs(); + // Outputs F(A) to 'output', which must have the correct size. + // It's OK if 'output' contains NaNs on entry. + // Before calling this, you must have set the values in + // 'OutputSingularValues()'. + void GetOutput(MatrixBase *output); + + // Computes the derivative of some function g w.r.t. the input A, + // given that dg/d(output) is provided in 'output_deriv'. + // This derivative is *added* to 'input_deriv', so you need + // to zero 'input_deriv' or otherwise set it, beforehand. + void ComputeInputDeriv(const MatrixBase &output_deriv, + MatrixBase *input_deriv); + + private: + // TODO. + + +}; + + /// @} end of "addtogroup matrix_funcs_misc" } // end namespace kaldi From 68f703d9dc0e8846de9ea066129c3e968c860f46 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 18 Nov 2018 14:54:44 -0500 Subject: [PATCH 02/87] [src] Add interface to handle symmetric matrices in SvdRescaler. 
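For reference, the calling pattern that the SvdRescaler class comment above is
describing, written out as a minimal sketch.  Here some_func / some_func_deriv
stand for whatever user-supplied scalar function is applied to the singular
values, the accessors are named as they end up later in this series
(OutputSingularValueDerivs() for the derivative vector), and the calls go
through the rescaler object rather than through A:

    Matrix<BaseFloat> A(20, 10);
    A.SetRandn();
    SvdRescaler rescaler(A, false);
    const VectorBase<BaseFloat> &lambda_in = rescaler.InputSingularValues();
    VectorBase<BaseFloat> &lambda_out = *(rescaler.OutputSingularValues()),
        &lambda_out_deriv = *(rescaler.OutputSingularValueDerivs());
    for (int32 i = 0; i < lambda_in.Dim(); i++) {
      // apply the scalar function and its derivative to each singular value.
      lambda_out(i) = some_func(lambda_in(i));
      lambda_out_deriv(i) = some_func_deriv(lambda_in(i));
    }
    Matrix<BaseFloat> B(A.NumRows(), A.NumCols(), kUndefined);
    rescaler.GetOutput(&B);                    // B = F(A).
    Matrix<BaseFloat> B_deriv(A.NumRows(), A.NumCols());
    B_deriv.SetRandn();                        // stand-in for d(objf)/dB.
    Matrix<BaseFloat> A_deriv(A.NumRows(), A.NumCols());  // zero on entry.
    rescaler.ComputeInputDeriv(B_deriv, &A_deriv);        // adds d(objf)/dA.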
--- src/matrix/matrix-functions.h | 51 +++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 9a3e7353abd..6f37cd8bf4f 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -200,8 +200,20 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase */ class SvdRescaler { - // Constructor. The parameter is the input matrix A. - SvdRescaler(const MatrixBase &A); + /* + Constructor. + 'A' is the input matrix. See class-level documentation above for + more information. + + If 'symmetric' is set to true, then the user is asserting that A is + symmetric, and that that symmetric structure needs to be preserved in the + output. In this case, we use code for the symmetric eigenvalue problem to + do the decomposition instead of the SVD. I.e. decompose A = P diag(s) P^T + instead of A = U diag(s) V^T, using SpMatrix::Eig(). You can view this as a + special case of SVD. + */ + SvdRescaler(const MatrixBase &A, + bool symmetric = false); // Get the singular values of A, which will have been // computed in the constructor @@ -232,6 +244,41 @@ class SvdRescaler { }; +class EigRescaler { + // Constructor. The parameter is the input matrix A. + EigRescaler(const SpMatrix &A); + + // Get the singular values of A, which will have been + // computed in the constructor + const VectorBase &InputSingularValues(); + // Returns a pointer to a place that you can write the + // modified singular values f(lambda). + VectorBase *OutputSingularValues(); + // Returns a pointer to a place that you can write the + // values of f'(lambda) (the function-derivative of f). + VectorBase *OutputSingularValuesDerivs(); + // Outputs F(A) to 'output', which must have the correct size. + // It's OK if 'output' contains NaNs on entry. + // Before calling this, you must have set the values in + // 'OutputSingularValues()'. + void GetOutput(SpMatrix *output); + + // Computes the derivative of some function g w.r.t. the input A, + // given that dg/d(output) is provided in 'output_deriv'. + // This derivative is *added* to 'input_deriv', so you need + // to zero 'input_deriv' or otherwise set it, beforehand. + void ComputeInputDeriv(const SpMatrix &output_deriv, + SpMatrix *input_deriv); + + private: + // TODO. + + +}; + + + + /// @} end of "addtogroup matrix_funcs_misc" } // end namespace kaldi From 96c5f70856b9a39788b3c2f4610891a69d67eb34 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 18 Nov 2018 16:08:03 -0500 Subject: [PATCH 03/87] [src] Modify interface of SvdRescaler, allow to fix singular values of input --- src/matrix/matrix-functions.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 6f37cd8bf4f..852ef74e791 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -215,9 +215,17 @@ class SvdRescaler { SvdRescaler(const MatrixBase &A, bool symmetric = false); - // Get the singular values of A, which will have been - // computed in the constructor - const VectorBase &InputSingularValues(); + // Get the singular values of A, which will have been computed in the + // constructor. 
The reason why this is not const is that there may be + // situations where you discover that the input matrix has some very small + // singular values, and you want to (say) floor them somehow and reconstruct, + // and have the derivatives be valid assuming you had given that 'repaired' + // matrix A as input. Modifying the elements of this vector gives you + // a way to do that, although currently this class doesn't provide a way + // for you to access that 'fixed-up' A directly. + // We hope you know what you are doing if you modify these singular values. + VectorBase &InputSingularValues(); + // Returns a pointer to a place that you can write the // modified singular values f(lambda). VectorBase *OutputSingularValues(); From fe8afeedebb28893c28eb2af957b58b278eed611 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 18 Nov 2018 17:02:40 -0500 Subject: [PATCH 04/87] [src] Extend interface of SvdRescalar --- src/matrix/matrix-functions.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 852ef74e791..e351494696b 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -215,6 +215,15 @@ class SvdRescaler { SvdRescaler(const MatrixBase &A, bool symmetric = false); + // Constructor that takes no args. In this case you are supposed to + // call Init() + SvdRescaler(); + + // An alternative to the constructor that takes args. Should only be called + // directly after initializing the object with no args. + void Init(const MatrixBase &A, + bool symmetric = false); + // Get the singular values of A, which will have been computed in the // constructor. The reason why this is not const is that there may be // situations where you discover that the input matrix has some very small From 6ed9b7e03f77453a6da8149b4588a611ee5cb775 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 18 Nov 2018 17:57:25 -0500 Subject: [PATCH 05/87] [src] Extend interface of SvdRescaler --- src/matrix/matrix-functions.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index e351494696b..dc7ac9ad6bc 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -220,8 +220,10 @@ class SvdRescaler { SvdRescaler(); // An alternative to the constructor that takes args. Should only be called - // directly after initializing the object with no args. - void Init(const MatrixBase &A, + // directly after initializing the object with no args. Warning: this object + // keeps a reference to this matrix, so don't modify it during the lifetime + // of this object. + void Init(const MatrixBase *A, bool symmetric = false); // Get the singular values of A, which will have been computed in the From 087925dba50909ee6e877b5e93a2f50142616cba Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 18 Nov 2018 18:44:10 -0500 Subject: [PATCH 06/87] [src] Modify interface of SvdRescaler --- src/matrix/matrix-functions.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index dc7ac9ad6bc..a4ea8c10b71 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -240,9 +240,7 @@ class SvdRescaler { // Returns a pointer to a place that you can write the // modified singular values f(lambda). VectorBase *OutputSingularValues(); - // Returns a pointer to a place that you can write the - // values of f'(lambda) (the function-derivative of f). 
- VectorBase *OutputSingularValuesDerivs(); + // Outputs F(A) to 'output', which must have the correct size. // It's OK if 'output' contains NaNs on entry. // Before calling this, you must have set the values in From ed0c319383abb062966322f280fcc4fdd45d0a13 Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Mon, 19 Nov 2018 09:17:48 +0800 Subject: [PATCH 07/87] modification1 --- src/matrix/matrix-functions.cc | 30 ++++++++++++++++++++++++++++++ src/matrix/matrix-functions.h | 6 +++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 496c09f5344..5dcd4b9d5c7 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -769,5 +769,35 @@ void AddOuterProductPlusMinus(double alpha, MatrixBase *plus, MatrixBase *minus); +SvdRescaler::SvdRescaler(const matrixBase &A): + input_matrix_A_(A) {} + +const Vectorbase &SvdRescaler::InputSingularValues() { + int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), + rc_min = std::min(rows, cols); + Vector s(rc_min); // singular value vector + Matrix U(rows, rc_min), Vt(rc_min, cols); + input_matrix_A_.DestructiveSvd(&s, &U, &Vt); + SortSvd(&s, &U, &Vt); + return s; +} + +VectorBase* SvdRescaler::OutputSingularvalues() { + int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), + rc_min = std::min(rows, cols); + Vector *s(rc_min); + return *s; +} + +VectorBase* SvdRescaler::OutputSingularValuesDerivs() { + int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), + rc_min = std::min(rows, cols); + Vector *s(rc_min); + return *s; +} + +Void SvdRescaler::GetOutput(MatrixBase *output) { + +} } // end namespace kaldi diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 9a3e7353abd..1cd8dda5445 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -226,9 +226,9 @@ class SvdRescaler { MatrixBase *input_deriv); private: - // TODO. 
- - + MatrixBase input_matrix_A_; + MatrixBase U_, Vt_; + VectorBase lambda_in_, lambda_out_, lambda_out_deriv_; }; From 23a522b7e1ca8d53afc364a2d290ba16d1cb56c3 Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Mon, 19 Nov 2018 19:27:00 +0800 Subject: [PATCH 08/87] Add tempt code fot SvdRescaler --- src/matrix/matrix-functions.cc | 113 ++++++++++++++++++++++++++------- src/matrix/matrix-functions.h | 14 ++-- 2 files changed, 98 insertions(+), 29 deletions(-) diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 5dcd4b9d5c7..1e801324fd0 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -769,35 +769,102 @@ void AddOuterProductPlusMinus(double alpha, MatrixBase *plus, MatrixBase *minus); -SvdRescaler::SvdRescaler(const matrixBase &A): - input_matrix_A_(A) {} - -const Vectorbase &SvdRescaler::InputSingularValues() { - int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), - rc_min = std::min(rows, cols); - Vector s(rc_min); // singular value vector - Matrix U(rows, rc_min), Vt(rc_min, cols); - input_matrix_A_.DestructiveSvd(&s, &U, &Vt); - SortSvd(&s, &U, &Vt); - return s; +SvdRescaler::SvdRescaler(const matrixBase &A, bool symmetric = false): + input_matrix_A_(A), + symmetric_(symmetric) { + int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), + rc_min = std::min(rows, cols); + Vector s(rc_min); // singular value vector + Matrix U(rows, rc_min), Vt(rc_min, cols); + input_matrix_A_.DestructiveSvd(&s, &U, &Vt); + SortSvd(&s, &U, &Vt); + lambda_in_ = s; + *lambda_out_ = s; + U_ = U; + Vt_ = Vt; + } + +void SvdRescaler::Init(const MatrixBase *A, bool symmetric = false) { + *input_matrix_A_ = A; + symmetric_ = symmetric; } -VectorBase* SvdRescaler::OutputSingularvalues() { - int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), - rc_min = std::min(rows, cols); - Vector *s(rc_min); - return *s; +Vectorbase &SvdRescaler::InputSingularValues() { + return lambda_in_; } -VectorBase* SvdRescaler::OutputSingularValuesDerivs() { - int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), - rc_min = std::min(rows, cols); - Vector *s(rc_min); - return *s; +VectorBase *SvdRescaler::OutputSingularvalues() { + return lambda_out_; } -Void SvdRescaler::GetOutput(MatrixBase *output) { - +VectorBase *SvdRescaler::OutputSingularvaluesDerivs() { + return lambda_out_deriv_; +} + +void SvdRescaler::GetOutput(MatrixBase *output) { + KALDI_ASSERT(output->NumRows() == input_matrix_A_->NumRows() && + output->NumCols() == input_matrix_A_->NumCols()); + MatrixBase U_tmpt(U_); + U_tmpt.MulColsVec(*lambda_out_); + U_tmpt.AddMatMat(1.0, U_tmpt, kNoTrans, Vt_, kNoTrans, 0.0); + *output = U_tmpt; } +void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, + MatrixBase *input_deriv) { + KALDI_ASSERT(output_deriv.NumRows() == U_.NumRows() && + output_deriv.NumCols() == Vt_.NumRows() && + input_deriv.NumRows() == U_.NumRows() && + input_deriv.NumCols() == Vt_.NumRows() && + U_.NumCols() == Vt_.NumRows()); + // \bar{A} + input_deriv->SetZero(); + + // \bar{D} + MatrixBase intermediate_deriv(U_.NumCols(), Vt_.NumCols()); + intermediate_deriv.AddmatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, + Vt_, kNoTrans, 0.0); + // some intermediate variables + // store the diriv of {f'(\lambda_{i})}\times{\bar\d_{i,i}} + VectorBase diagonal_deriv_intermediate(U_.NumCols()); + diagonal_deriv_intermediate.SetZero(); + diagonal_deriv_intermediate.CopyDiagFromMat(intermediate_deriv); + 
diagonal_deriv_intermediate.MulElements(*lambda_out_deriv_); + // store \lambda_{i} \times d_{i} + MatrixBase diagonal_deriv_intermediate2(U_.NumCols(), U_.NumCols()); + diagonal_deriv_intermediate2.SetZero(); + diagonal_deriv_intermediate2.AddVecVec(1.0, lambda_in_, *lambda_out_); + // store \lambda_{i} \times \lambda_{i} + VectorBase diagonal_deriv_intermediate3(U_.NumCols()); + diagonal_deriv_intermediate3.SetZero(); + diagonal_deriv_intermediate3.AddVec2(1.0, lambda_in_qstat); + + for(MatrixIndexT i = 0; i < U_.NumCols(); i++) + { + for(MatrixIndexT j = 0; j < Vt_.NumCols(); i++) + { + // there may remain bugs! + if ((lambda_in_(i) == 0.0) && (lambda_in_(j) == 0.0) && (i != j)) { + *input_deriv(i, j) = intermediate_deriv(i, j) * lambda_out_deriv_(i); + } else if ((i != j) && (lambda_in_(i) - lambda_in_(j) > 0.0000001)) { + *input_deriv(i, j) = intermediate_deriv(i, j) + *(diagonal_deriv_intermediate2(i, i) - diagonal_deriv_intermediate2(j, j)) + / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)) + + intermediate_deriv(j, i) + *(diagonal_deriv_intermediate2(j, i) - diagonal_deriv_intermediate2(i, j)) + / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)); + } else if ((i != j) && (lambda_in_(i) - lambda_in_(j) < 0.0000001)) { + float lambda_avg = (lambda_in_(i) + lambda_in_(jump)) / 2.0; + *input_deriv(i, j) = intermediate_deriv(i, j) + * (lambda_avg * (*lambda_out_deriv_(i)) + *lambda_out_(i)) + / (2.0 * lambda_avg) + + intermediate_deriv(j, i) + * (lambda_avg * (*lambda_out_deriv_(i)) - *lambda_out_(i)) + / (2.0 * lambda_avg); + } + } + } + input_deriv->CopyDiagFromMat(diagonal_deriv_intermediate); + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, *input_deriv, kNoTrans, Vt_, kTrans); +} } // end namespace kaldi diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 10db9d2fa51..d6d32427053 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -241,6 +241,8 @@ class SvdRescaler { // modified singular values f(lambda). VectorBase *OutputSingularValues(); + VectorBase *OutputSingularValuesDerivs(); + // Outputs F(A) to 'output', which must have the correct size. // It's OK if 'output' contains NaNs on entry. // Before calling this, you must have set the values in @@ -255,9 +257,11 @@ class SvdRescaler { MatrixBase *input_deriv); private: - // TODO. 
- - + MatrixBase *input_matrix_A_; + bool symmetric_; + MatrixBase U_, Vt_; + VectorBase lambda_in_; + VectorBase *lambda_out_, *lambda_out_deriv_; }; @@ -288,9 +292,7 @@ class EigRescaler { SpMatrix *input_deriv); private: - MatrixBase input_matrix_A_; - MatrixBase U_, Vt_; - VectorBase lambda_in_, lambda_out_, lambda_out_deriv_; + //TODO }; From d15d32bc344d1a237c4578489c76861afaf9490f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 19 Nov 2018 12:23:03 -0500 Subject: [PATCH 09/87] [src] small fixes to SvdRescaler interface --- src/matrix/matrix-functions.h | 39 +++++------------------------------ 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index a4ea8c10b71..a42a30931b2 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -182,7 +182,7 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase SvdRescaler rescaler(A); const VectorBase &lambda_in = A.InputSingularValues(); VectorBase &lambda_out = *(A.OutputSingularValues()); - VectorBase &lambda_out_deriv = *(A.OutputSingularValues()); + VectorBase &lambda_out_deriv = *(A.OutputSingularValueDerivs()); for (int32 i = 0; i < lambda_in.size(); i++) { // compute the scalar function and its derivative for the singular // values. @@ -241,6 +241,10 @@ class SvdRescaler { // modified singular values f(lambda). VectorBase *OutputSingularValues(); + // Returns a pointer to a place that you can write the + // values of f'(lambda) (the function-derivative of f). + VectorBase *OutputSingularValueDerivs(); + // Outputs F(A) to 'output', which must have the correct size. // It's OK if 'output' contains NaNs on entry. // Before calling this, you must have set the values in @@ -261,39 +265,6 @@ class SvdRescaler { }; -class EigRescaler { - // Constructor. The parameter is the input matrix A. - EigRescaler(const SpMatrix &A); - - // Get the singular values of A, which will have been - // computed in the constructor - const VectorBase &InputSingularValues(); - // Returns a pointer to a place that you can write the - // modified singular values f(lambda). - VectorBase *OutputSingularValues(); - // Returns a pointer to a place that you can write the - // values of f'(lambda) (the function-derivative of f). - VectorBase *OutputSingularValuesDerivs(); - // Outputs F(A) to 'output', which must have the correct size. - // It's OK if 'output' contains NaNs on entry. - // Before calling this, you must have set the values in - // 'OutputSingularValues()'. - void GetOutput(SpMatrix *output); - - // Computes the derivative of some function g w.r.t. the input A, - // given that dg/d(output) is provided in 'output_deriv'. - // This derivative is *added* to 'input_deriv', so you need - // to zero 'input_deriv' or otherwise set it, beforehand. - void ComputeInputDeriv(const SpMatrix &output_deriv, - SpMatrix *input_deriv); - - private: - // TODO. 
- - -}; - - /// @} end of "addtogroup matrix_funcs_misc" From df74407bf9432511cb4d32debfa4e2552130bf77 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 19 Nov 2018 16:57:59 -0500 Subject: [PATCH 10/87] [src] Small interface fix --- src/matrix/matrix-functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index a42a30931b2..10ba536f0d7 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -199,7 +199,7 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase */ class SvdRescaler { - + public: /* Constructor. 'A' is the input matrix. See class-level documentation above for From 5d106e5bfafae1c8d0347f7cb8a0cc956e6ab9b4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 19 Nov 2018 19:07:47 -0500 Subject: [PATCH 11/87] [src] Adding draft of core DifferentiableFmllr code. --- src/transform/differentiable-fmllr-test.cc | 50 ++++++ src/transform/differentiable-fmllr.cc | 165 +++++++++++++++++ src/transform/differentiable-fmllr.h | 199 +++++++++++++++++++++ 3 files changed, 414 insertions(+) create mode 100644 src/transform/differentiable-fmllr-test.cc create mode 100644 src/transform/differentiable-fmllr.cc create mode 100644 src/transform/differentiable-fmllr.h diff --git a/src/transform/differentiable-fmllr-test.cc b/src/transform/differentiable-fmllr-test.cc new file mode 100644 index 00000000000..96b50c49cde --- /dev/null +++ b/src/transform/differentiable-fmllr-test.cc @@ -0,0 +1,50 @@ +// transform/differentiable-fmllr-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
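+
+// (These tests exercise the CoreFmllrEstimator code declared in
+// transform/differentiable-fmllr.h.  The simple test below builds diagonal
+// G and K statistics for a random count gamma and sanity-checks the
+// resulting transform A.)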
+ +#include "transform/differentiable-fmllr.h" + +namespace kaldi { +namespace differentiable_transform { + + +void UnitTestCoreFmllrEstimatorSimple() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + G.AddToDiag(1.234 * gamma); + K.AddToDiag(0.234 * gamma); + CoreFmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + estimator.Forward(); + KALDI_LOG << "A is " << A; +} + + +} // namespace kaldi +} // namespace differentiable_transform + + + +int main() { + using namespace kaldi::differentiable_transform; + + UnitTestCoreFmllrEstimatorSimple(); + std::cout << "Test OK.\n"; +} diff --git a/src/transform/differentiable-fmllr.cc b/src/transform/differentiable-fmllr.cc new file mode 100644 index 00000000000..59fc1d59507 --- /dev/null +++ b/src/transform/differentiable-fmllr.cc @@ -0,0 +1,165 @@ +// transform/differentiable-fmllr.cc + +// Copyright 2018 Johns Hopkins University + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "transform/differentiable-fmllr.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { +namespace differentiable_transform { + +CoreFmllrEstimator::CoreFmllrEstimator( + const CoreFmllrEstimatorOptions &opts, + BaseFloat gamma, + const MatrixBase &G, + const MatrixBase &K, + MatrixBase *A): + opts_(opts), gamma_(gamma), + G_(G), K_(K), A_(A) { + KALDI_ASSERT(opts.singular_value_relative_floor > 0.0 && + gamma > 0.0 && G.NumRows() == K.NumRows() && + K.NumRows() == K.NumCols() && + SameDim(K, *A)); +} + + +BaseFloat CoreFmllrEstimator::Forward() { + ComputeH(); + ComputeL(); + ComputeB(); + ComputeA(); + return ComputeObjfChange(); +} + +void CoreFmllrEstimator::ComputeH() { + int32 dim = G_.NumRows(); + bool symmetric = true; + G_rescaler_.Init(&G_, symmetric); + VectorBase &G_singular_values = G_rescaler_.InputSingularValues(); + BaseFloat floor = + G_singular_values.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + G_singular_values.ApplyFloor(floor, &num_floored); + if (num_floored > 0.0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in G matrix."; + VectorBase + &H_singular_values = *G_rescaler_.OutputSingularValues(), + &H_singular_value_derivs = *G_rescaler_.OutputSingularValueDerivs(); + H_singular_values.CopyFromVec(G_singular_values); + // H is going to be G^{-0.5}. + // We don't have to worry about division by zero because we already floored the + // singular values of G. + H_singular_values.ApplyPow(-0.5); + // the derivative of lambda^{-0.5} w.r.t. lambda is -0.5 lambda^{-1.5}; + // we fill in this value in H_singular_value_derivs. 
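+  // (Recall that G is symmetric, so the 'SVD' here is really the symmetric
+  // eigendecomposition G = P diag(lambda) P^T, giving H = P diag(lambda^{-0.5}) P^T.)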
+ H_singular_value_derivs.CopyFromVec(G_singular_values); + H_singular_value_derivs.ApplyPow(-1.5); + H_singular_value_derivs.Scale(-0.5); + H_.Resize(dim, dim, kUndefined); + G_rescaler_.GetOutput(&H_); +} + +void CoreFmllrEstimator::ComputeL() { + int32 dim = G_.NumRows(); + L_.Resize(dim, dim); + L_.AddMatMat(1.0, K_, kNoTrans, H_, kNoTrans, 0.0); +} + +// Compute B = F(L), where F is the +// function that takes the singular values of L, puts them through the function +// f(lamba) = (lambda + sqrt(lambda^2 + 4 gamma)) / 2. +void CoreFmllrEstimator::ComputeB() { + int32 dim = L_.NumRows(); + bool symmetric = false; + L_rescaler_.Init(&L_, symmetric); + VectorBase &L_singular_values = L_rescaler_.InputSingularValues(); + BaseFloat floor = + L_singular_values.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + L_singular_values.ApplyFloor(floor, &num_floored); + if (num_floored > 0.0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in K matrix."; + VectorBase + &B_singular_values = *L_rescaler_.OutputSingularValues(), + &B_singular_value_derivs = *L_rescaler_.OutputSingularValueDerivs(); + // lambda is the original singular value of l, + // f is where we put f(lambda) + // f_prime is where we put f'(lambda) (the derivative of f w.r.t lambda). + BaseFloat *lambda = L_singular_values.Data(), + *f = B_singular_values.Data(), + *f_prime = B_singular_value_derivs.Data(); + + BaseFloat gamma = gamma_; + for (int32 i = 0; i < dim; i++) { + BaseFloat lambda_i = lambda[i]; + f[i] = (lambda_i + std::sqrt(lambda_i * lambda_i + 4.0 * gamma)) / 2.0; + f_prime[i] = (1.0 + lambda_i / + std::sqrt(lambda_i * lambda_i + 4.0 * gamma)) / 2.0; + } + B_.Resize(dim, dim, kUndefined); + L_rescaler_.GetOutput(&B_); +} + +void CoreFmllrEstimator::ComputeA() { + A_->SetZero(); // Make sure there are no NaN's. + A_->AddMatMat(1.0, B_, kNoTrans, H_, kNoTrans, 0.0); +} + +void CoreFmllrEstimator::BackpropA(const MatrixBase &A_deriv, + MatrixBase *B_deriv, + MatrixBase *H_deriv) { + B_deriv->AddMatMat(1.0, A_deriv, kNoTrans, H_, kTrans, 0.0); + H_deriv->AddMatMat(1.0, B_, kTrans, A_deriv, kNoTrans, 0.0); +} + +void CoreFmllrEstimator::BackpropL(const MatrixBase &L_deriv, + MatrixBase *K_deriv, + MatrixBase *H_deriv) { + K_deriv->AddMatMat(1.0, L_deriv, kNoTrans, H_, kTrans, 0.0); + H_deriv->AddMatMat(1.0, K_, kTrans, L_deriv, kNoTrans, 1.0); +} + + +void CoreFmllrEstimator::Backward(const MatrixBase &A_deriv, + Matrix *G_deriv, + Matrix *K_deriv) { + KALDI_ASSERT(SameDim(A_deriv, *A_) && SameDim(A_deriv, *G_deriv) + && SameDim(*G_deriv, *K_deriv)); + int32 dim = A_->NumRows(); + Matrix B_deriv(dim, dim), H_deriv(dim, dim), + L_deriv(dim, dim); + BackpropA(A_deriv, &B_deriv, &H_deriv); + // Backprop through the operation B = F(L). + L_rescaler_.ComputeInputDeriv(B_deriv, &L_deriv); + BackpropL(L_deriv, K_deriv, &H_deriv); + // Backprop through the operation H = G^{-0.5}. + G_rescaler_.ComputeInputDeriv(H_deriv, G_deriv); + + { // Make sure G_deriv is symmetric. Use H_deriv as a temporary. 
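+    // (i.e. replace G_deriv by 0.5 * (G_deriv + G_deriv^T).)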
+ H_deriv.CopyFromMat(*G_deriv); + G_deriv->AddMat(1.0, H_deriv, kTrans); + G_deriv->Scale(0.5); + } +} + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/transform/differentiable-fmllr.h b/src/transform/differentiable-fmllr.h new file mode 100644 index 00000000000..5b8ae4445da --- /dev/null +++ b/src/transform/differentiable-fmllr.h @@ -0,0 +1,199 @@ +// transform/differentiable-fmllr.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ + +#include + +#include "base/kaldi-common.h" +#include "util/kaldi-table.h" +#include "util/kaldi-holder.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { + + +namespace differentiable_transform { + + +// This header contains some utilities for implementing differentiable fMLLR. +// Since it is fairly complicated, we aren't putting all the implementation +// details in class FmllrTransform (in differentiable-transform.h), but +// segregating most of the technical stuff to this file. This also +// allows us to separate out the testing of individual components. +// The reference for things in this header is +// http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. +// The notation we are using corresponds to the notation used in +// the "Summary" section of that document. + + + +/** + With reference to the notation in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, + this class implements the operation that takes G and K as input (and the + count gamma), and produces A. This has been separated into its own object + for purposes of testability. + */ + + +struct CoreFmllrEstimatorOptions { + + // singular_value_relative_floor is floor that we apply on the + // singular values of the inputs G and K, to ensure that no NaN's are + // generated in the forward pass and to prevent the derivatives + // in the backprop from becoming undefined. It affects both + // the forward and backward computations. A warning will be printed + // if this floor actually had an effect. + // Must be greater than zero (to avoid the possibility of generating + // NaN's). + BaseFloat singular_value_relative_floor; + + CoreFmllrEstimatorOptions(): + singular_value_relative_floor(0.001) { } +}; + + +class CoreFmllrEstimator { + public: + /** + Constructor. Does not do any real work. This class will store + references/pointers to G, K and A, so you need to make sure that + those quantities exist for the lifetime of this object. + + @param [in] opts Options class; see its definition for details. Will be copied + in the constructor. + @param [in] gamma The total data-count (often this will be the number of frames). + @param [in] G A symmetric matrix containing the quadratic + stats for estimating A. 
This the sum of outer products + of the input features, after mean subtraction, and + weighted by the inverse-variance factor s_i. Must be + positive definite for this computation to be well + defined. + @param [in] K A matrix containing the linear stats for estimating A. + This is a sum of outer products of the means with the + input features, with mean subtraction and inverse-variance + weighting. Must not have more than one zero singular value + for this computation to be well defined. + @param [in] A We mark this as an input parameter but it is the location + where the output of this computation will be placed when + you call Forward(). May be undefined (e.g., NaN) on + entry. You must not change the value of A between + calling Forward() and calling Backward(). + + TODO: introduc + */ + CoreFmllrEstimator(const CoreFmllrEstimatorOptions &opts, + BaseFloat gamma, + const MatrixBase &G, + const MatrixBase &K, + MatrixBase *A); + + /** + Does the forward pass of estimation. Writes to the location + 'A' that was passed to the constructor. + + Returns the objective-function improvement per frame, as compared + with what the objective-function would be with unit A. This equals + the total objective function improvement divided by gamma. + */ + BaseFloat Forward(); + + + /** + Does the backward pass. + @param [in] A_deriv The derivative of the objective + function (say, f) w.r.t. the output A (which was passed as a + pointer to the constructor). + @param [out] G_deriv A pointer to a location where the + derivative df/dG will be written. Will be added to, so + should contain zero (or some other defined value) + at input. + @param [out] K_deriv A pointer to a location where the + derivative df/dK will be written (so the i,j'th + element is the derivative w.r.t. the i,j'th element + of the input matrix K. + */ + void Backward(const MatrixBase &A_deriv, + Matrix *G_deriv, + Matrix *K_deriv); + + private: + // Computes H = G^{-0.5} + void ComputeH(); + // Compute L = K H + void ComputeL(); + // Compute B = F(L), where F is the + // function that takes the singular values of L, puts them through the function + // f(lamba) = (lambda + sqrt(lambda^2 + 4 gamma)) / 2. + void ComputeB(); + // Computes A = B H. + void ComputeA(); + + + // Backprops through the operation "A = B H". B_deriv and H_deriv + // must be free of NaN and inf on entry. + void BackpropA(const MatrixBase &A_deriv, + MatrixBase *B_deriv, + MatrixBase *H_deriv); + + // Backprops through the function "L = K H".. + // K_deriv must be free of NaN and inf on entry, but otherwise + // its value is ignored. H_deriv is added to by this function. + void BackpropL(const MatrixBase &L_deriv, + MatrixBase *K_deriv, + MatrixBase *H_deriv); + + // returns the objective-function change (vs. A being the unit matrix) from + // this estimation. + BaseFloat ComputeObjfChange(); + + CoreFmllrEstimatorOptions opts_; + BaseFloat gamma_; + const MatrixBase &G_; + const MatrixBase &K_; + MatrixBase *A_; + + // H = G^{-0.5} is symmetric. + Matrix H_; + // L = K H. + Matrix L_; + // B = F(L) is the result of applying SvdRescaler with + // the function f(lambda) = ((lambda + sqrt(lambda^2 + 4 gamma)) / 2) + Matrix B_; + + // Object that helps us to compute, and to backprop through the + // computation of, H = G^{-0.5}. 
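+  // (ComputeH() initializes this rescaler with symmetric == true, since G
+  // is symmetric.)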
+ SvdRescaler G_rescaler_; + + // Object that helps us to compute, and to backprop through the computation + // of: B = F(L), where F is the function that takes the singular values of L, + // puts them through the function f(lamba) = (lambda + sqrt(lambda^2 + 4 + // gamma)) / 2. + SvdRescaler L_rescaler_; + +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ From 52a6a31dc322d97a212bd60880a3abc06a498aeb Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 19 Nov 2018 22:30:11 -0500 Subject: [PATCH 12/87] [src] small changes --- src/matrix/matrix-functions.h | 3 +++ src/transform/Makefile | 8 +++++--- src/transform/differentiable-fmllr-test.cc | 18 +++++++++++++++++- src/transform/differentiable-fmllr.h | 5 ++++- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 10ba536f0d7..5808d078639 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -255,6 +255,9 @@ class SvdRescaler { // given that dg/d(output) is provided in 'output_deriv'. // This derivative is *added* to 'input_deriv', so you need // to zero 'input_deriv' or otherwise set it, beforehand. + // It is acceptable to call ComputeInputDeriv (with possibly different + // values of 'output_deriv' and 'input_deriv' as many times as you want, + // on the same object. void ComputeInputDeriv(const MatrixBase &output_deriv, MatrixBase *input_deriv); diff --git a/src/transform/Makefile b/src/transform/Makefile index a265db6ac37..67e5b78fb10 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -4,17 +4,19 @@ include ../kaldi.mk TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \ regression-tree-test fmllr-diag-gmm-test \ - regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test + regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test \ + differentiable-fmllr-test OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \ lvtln.o mllt.o fmpe.o basis-fmllr-diag-gmm.o \ - compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o + compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o \ + differentiable-fmllr.o LIBNAME = kaldi-transform ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/transform/differentiable-fmllr-test.cc b/src/transform/differentiable-fmllr-test.cc index 96b50c49cde..1846c8ffdb6 100644 --- a/src/transform/differentiable-fmllr-test.cc +++ b/src/transform/differentiable-fmllr-test.cc @@ -15,7 +15,7 @@ // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and -// limitations under the License. +//1 limitations under the License. #include "transform/differentiable-fmllr.h" @@ -23,6 +23,20 @@ namespace kaldi { namespace differentiable_transform { + +// Test derivatives produced by the Estimator object. +// +void TestCoreFmllrEstimatorDerivs( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + // TODO. 
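+  // (Intended check, along the lines of the K-derivative test added in a later
+  // patch: define a random linear objective via a matrix A_deriv; get G_deriv
+  // and K_deriv from estimator->Backward(A_deriv, &G_deriv, &K_deriv); then for
+  // a few small random perturbations delta_K, compare the predicted change
+  // TraceMatMat(delta_K, K_deriv, kTrans) with the observed change
+  // TraceMatMat(A_new - A, A_deriv, kTrans), where A_new is re-estimated from
+  // K + delta_K; and similarly for G.)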
+ +} + + void UnitTestCoreFmllrEstimatorSimple() { int32 dim = RandInt(10, 20); BaseFloat gamma = RandInt(5, 10); @@ -34,6 +48,8 @@ void UnitTestCoreFmllrEstimatorSimple() { CoreFmllrEstimator estimator(opts, gamma, G, K, &A); estimator.Forward(); KALDI_LOG << "A is " << A; + KALDI_ASSERT(A.IsUnit(0.01)); + TestCoreFmllrEstimatorDerivs(G, K, A, &estimator); } diff --git a/src/transform/differentiable-fmllr.h b/src/transform/differentiable-fmllr.h index 5b8ae4445da..85c65ffdf02 100644 --- a/src/transform/differentiable-fmllr.h +++ b/src/transform/differentiable-fmllr.h @@ -119,7 +119,10 @@ class CoreFmllrEstimator { /** - Does the backward pass. + Does the backward pass. Note: it is permissible to call + Backward() any number of times, it does not have to be called + exactly once. + @param [in] A_deriv The derivative of the objective function (say, f) w.r.t. the output A (which was passed as a pointer to the constructor). From e8ffbbff8cb7e168d7e87d259b6c9af8c250936c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 19 Nov 2018 23:14:56 -0500 Subject: [PATCH 13/87] [src] Add more testing code --- src/transform/differentiable-fmllr-test.cc | 50 +++++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/src/transform/differentiable-fmllr-test.cc b/src/transform/differentiable-fmllr-test.cc index 1846c8ffdb6..4ea12936997 100644 --- a/src/transform/differentiable-fmllr-test.cc +++ b/src/transform/differentiable-fmllr-test.cc @@ -24,16 +24,50 @@ namespace differentiable_transform { -// Test derivatives produced by the Estimator object. -// -void TestCoreFmllrEstimatorDerivs( +// Test derivatives produced by the Estimator object for K. +void TestCoreFmllrEstimatorKDeriv( BaseFloat gamma, const Matrix &G, const Matrix &K, const Matrix &A, CoreFmllrEstimator *estimator) { - // TODO. + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-04 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + for (int32 i = 0; i < num_directions; i++) { + Matrix K_new(dim, dim); + K_new.SetRandn(); + K_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(K_new, K_deriv, kTrans); + K_new.AddMat(1.0, K); + CoreFmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G, K_new, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that. + actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } } @@ -46,10 +80,14 @@ void UnitTestCoreFmllrEstimatorSimple() { K.AddToDiag(0.234 * gamma); CoreFmllrEstimatorOptions opts; CoreFmllrEstimator estimator(opts, gamma, G, K, &A); - estimator.Forward(); + BaseFloat objf_impr = estimator.Forward(); KALDI_LOG << "A is " << A; KALDI_ASSERT(A.IsUnit(0.01)); - TestCoreFmllrEstimatorDerivs(G, K, A, &estimator); + KALDI_ASSERT(fabs(objf_impr) < 0.01); + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + // TestCoreFmllrEstimatorGDeriv(G, K, A, &estimator); + } } From de65be289f4f5a8c4a166ae32ba1fda68a1c11f6 Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Tue, 20 Nov 2018 13:43:32 +0800 Subject: [PATCH 14/87] fix --- src/matrix/matrix-functions.cc | 16 +++++++++++++--- src/matrix/matrix-functions.h | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 1e801324fd0..ade94f6649e 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -768,8 +768,8 @@ void AddOuterProductPlusMinus(double alpha, const VectorBase &b, MatrixBase *plus, MatrixBase *minus); - -SvdRescaler::SvdRescaler(const matrixBase &A, bool symmetric = false): +/* +SvdRescaler::SvdRescaler(const MatrixBase &A, bool symmetric = false): input_matrix_A_(A), symmetric_(symmetric) { int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), @@ -783,10 +783,20 @@ SvdRescaler::SvdRescaler(const matrixBase &A, bool symmetric = false) U_ = U; Vt_ = Vt; } - +*/ void SvdRescaler::Init(const MatrixBase *A, bool symmetric = false) { *input_matrix_A_ = A; symmetric_ = symmetric; + int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), + rc_min = std::min(rows, cols); + Vector s(rc_min); // singular value vector + Matrix U(rows, rc_min), Vt(rc_min, cols); + input_matrix_A_.DestructiveSvd(&s, &U, &Vt); + SortSvd(&s, &U, &Vt); + lambda_in_ = s; + *lambda_out_ = s; + U_ = U; + Vt_ = Vt; } Vectorbase &SvdRescaler::InputSingularValues() { diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 058c3416e3a..36f5acd78ae 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -213,7 +213,7 @@ class SvdRescaler { special case of SVD. */ SvdRescaler(const MatrixBase &A, - bool symmetric = false); + bool symmetric = false): input_matrix_A_(A), symmetric_(symmetric) {} // Constructor that takes no args. 
In this case you are supposed to // call Init() From afff449e9d6b342e2092f6d9b55621addab8a36a Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Tue, 20 Nov 2018 17:08:30 +0800 Subject: [PATCH 15/87] fix --- src/matrix/matrix-functions.cc | 23 ++++++++++++++--------- src/matrix/matrix-functions.h | 11 ++++------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index ade94f6649e..6a97916a312 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -768,10 +768,11 @@ void AddOuterProductPlusMinus(double alpha, const VectorBase &b, MatrixBase *plus, MatrixBase *minus); -/* -SvdRescaler::SvdRescaler(const MatrixBase &A, bool symmetric = false): - input_matrix_A_(A), - symmetric_(symmetric) { + +SvdRescaler::SvdRescaler(const MatrixBase &A, + bool symmetric): + input_matrix_A_(A), + symmetric_(symmetric) { int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), rc_min = std::min(rows, cols); Vector s(rc_min); // singular value vector @@ -782,11 +783,15 @@ SvdRescaler::SvdRescaler(const MatrixBase &A, bool symmetric = false) *lambda_out_ = s; U_ = U; Vt_ = Vt; +} + +void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { + input_matrix_A_ = *A; + if (symmetric) { + symmetric_ = symmetric; + } else { + symmetric_ = false; } -*/ -void SvdRescaler::Init(const MatrixBase *A, bool symmetric = false) { - *input_matrix_A_ = A; - symmetric_ = symmetric; int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), rc_min = std::min(rows, cols); Vector s(rc_min); // singular value vector @@ -799,7 +804,7 @@ void SvdRescaler::Init(const MatrixBase *A, bool symmetric = false) { Vt_ = Vt; } -Vectorbase &SvdRescaler::InputSingularValues() { +VectorBase &SvdRescaler::InputSingularValues() { return lambda_in_; } diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index f6c1141c7e2..de668ccf0b9 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -212,8 +212,7 @@ class SvdRescaler { instead of A = U diag(s) V^T, using SpMatrix::Eig(). You can view this as a special case of SVD. */ - SvdRescaler(const MatrixBase &A, - bool symmetric = false): input_matrix_A_(A), symmetric_(symmetric) {} + SvdRescaler(const MatrixBase &A, bool symmetric); // Constructor that takes no args. In this case you are supposed to // call Init() @@ -223,8 +222,7 @@ class SvdRescaler { // directly after initializing the object with no args. Warning: this object // keeps a reference to this matrix, so don't modify it during the lifetime // of this object. - void Init(const MatrixBase *A, - bool symmetric = false); + void Init(const MatrixBase *A, bool symmetric); // Get the singular values of A, which will have been computed in the // constructor. The reason why this is not const is that there may be @@ -260,9 +258,8 @@ class SvdRescaler { // on the same object. 
void ComputeInputDeriv(const MatrixBase &output_deriv, MatrixBase *input_deriv); - - private: - MatrixBase *input_matrix_A_; + protected: + Matrix input_matrix_A_; bool symmetric_; MatrixBase U_, Vt_; VectorBase lambda_in_; From 96771f62a9dbb74194594aa75b48ab466ab813ac Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Tue, 20 Nov 2018 19:45:52 +0800 Subject: [PATCH 16/87] No compiling problem remaining unless the constructor --- src/matrix/matrix-functions.cc | 57 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 6a97916a312..7937dd51285 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -768,7 +768,7 @@ void AddOuterProductPlusMinus(double alpha, const VectorBase &b, MatrixBase *plus, MatrixBase *minus); - +/* SvdRescaler::SvdRescaler(const MatrixBase &A, bool symmetric): input_matrix_A_(A), @@ -784,7 +784,7 @@ SvdRescaler::SvdRescaler(const MatrixBase &A, U_ = U; Vt_ = Vt; } - +*/ void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { input_matrix_A_ = *A; if (symmetric) { @@ -798,61 +798,62 @@ void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { Matrix U(rows, rc_min), Vt(rc_min, cols); input_matrix_A_.DestructiveSvd(&s, &U, &Vt); SortSvd(&s, &U, &Vt); - lambda_in_ = s; - *lambda_out_ = s; - U_ = U; - Vt_ = Vt; + lambda_in_.CopyFromVec(s); + lambda_out_->CopyFromVec(s); + lambda_out_deriv_->CopyFromVec(s); + U_.CopyFromMat(U); + Vt_.CopyFromMat(Vt); } VectorBase &SvdRescaler::InputSingularValues() { return lambda_in_; } -VectorBase *SvdRescaler::OutputSingularvalues() { +VectorBase *SvdRescaler::OutputSingularValues() { return lambda_out_; } -VectorBase *SvdRescaler::OutputSingularvaluesDerivs() { +VectorBase *SvdRescaler::OutputSingularValueDerivs() { return lambda_out_deriv_; } void SvdRescaler::GetOutput(MatrixBase *output) { - KALDI_ASSERT(output->NumRows() == input_matrix_A_->NumRows() && - output->NumCols() == input_matrix_A_->NumCols()); - MatrixBase U_tmpt(U_); + KALDI_ASSERT(output->NumRows() == input_matrix_A_.NumRows() && + output->NumCols() == input_matrix_A_.NumCols()); + Matrix U_tmpt(U_); U_tmpt.MulColsVec(*lambda_out_); U_tmpt.AddMatMat(1.0, U_tmpt, kNoTrans, Vt_, kNoTrans, 0.0); - *output = U_tmpt; + output->CopyFromMat(U_tmpt); } void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, MatrixBase *input_deriv) { KALDI_ASSERT(output_deriv.NumRows() == U_.NumRows() && output_deriv.NumCols() == Vt_.NumRows() && - input_deriv.NumRows() == U_.NumRows() && - input_deriv.NumCols() == Vt_.NumRows() && + input_deriv->NumRows() == U_.NumRows() && + input_deriv->NumCols() == Vt_.NumRows() && U_.NumCols() == Vt_.NumRows()); // \bar{A} input_deriv->SetZero(); // \bar{D} - MatrixBase intermediate_deriv(U_.NumCols(), Vt_.NumCols()); - intermediate_deriv.AddmatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, + Matrix intermediate_deriv(U_.NumCols(), Vt_.NumCols()); + intermediate_deriv.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kNoTrans, 0.0); // some intermediate variables // store the diriv of {f'(\lambda_{i})}\times{\bar\d_{i,i}} - VectorBase diagonal_deriv_intermediate(U_.NumCols()); + Vector diagonal_deriv_intermediate(U_.NumCols()); diagonal_deriv_intermediate.SetZero(); diagonal_deriv_intermediate.CopyDiagFromMat(intermediate_deriv); diagonal_deriv_intermediate.MulElements(*lambda_out_deriv_); // store \lambda_{i} \times d_{i} - MatrixBase diagonal_deriv_intermediate2(U_.NumCols(), 
U_.NumCols()); + Matrix diagonal_deriv_intermediate2(U_.NumCols(), U_.NumCols()); diagonal_deriv_intermediate2.SetZero(); diagonal_deriv_intermediate2.AddVecVec(1.0, lambda_in_, *lambda_out_); // store \lambda_{i} \times \lambda_{i} - VectorBase diagonal_deriv_intermediate3(U_.NumCols()); + Vector diagonal_deriv_intermediate3(U_.NumCols()); diagonal_deriv_intermediate3.SetZero(); - diagonal_deriv_intermediate3.AddVec2(1.0, lambda_in_qstat); + diagonal_deriv_intermediate3.AddVec2(1.0, lambda_in_); for(MatrixIndexT i = 0; i < U_.NumCols(); i++) { @@ -860,26 +861,26 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, { // there may remain bugs! if ((lambda_in_(i) == 0.0) && (lambda_in_(j) == 0.0) && (i != j)) { - *input_deriv(i, j) = intermediate_deriv(i, j) * lambda_out_deriv_(i); + (*input_deriv)(i, j) = intermediate_deriv(i, j) * (*lambda_out_deriv_)(i); } else if ((i != j) && (lambda_in_(i) - lambda_in_(j) > 0.0000001)) { - *input_deriv(i, j) = intermediate_deriv(i, j) + (*input_deriv)(i, j) = intermediate_deriv(i, j) *(diagonal_deriv_intermediate2(i, i) - diagonal_deriv_intermediate2(j, j)) / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)) + intermediate_deriv(j, i) *(diagonal_deriv_intermediate2(j, i) - diagonal_deriv_intermediate2(i, j)) / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)); } else if ((i != j) && (lambda_in_(i) - lambda_in_(j) < 0.0000001)) { - float lambda_avg = (lambda_in_(i) + lambda_in_(jump)) / 2.0; - *input_deriv(i, j) = intermediate_deriv(i, j) - * (lambda_avg * (*lambda_out_deriv_(i)) + *lambda_out_(i)) + float lambda_avg = (lambda_in_(i) + lambda_in_(j)) / 2.0; + (*input_deriv)(i, j) = intermediate_deriv(i, j) + * (lambda_avg * ((*lambda_out_deriv_)(i)) + (*lambda_out_)(i)) / (2.0 * lambda_avg) + intermediate_deriv(j, i) - * (lambda_avg * (*lambda_out_deriv_(i)) - *lambda_out_(i)) + * (lambda_avg * ((*lambda_out_deriv_)(i)) - (*lambda_out_)(i)) / (2.0 * lambda_avg); } } } - input_deriv->CopyDiagFromMat(diagonal_deriv_intermediate); - input_deriv->AddMatMatMat(1.0, U_, kNoTrans, *input_deriv, kNoTrans, Vt_, kTrans); + input_deriv->CopyDiagFromVec(diagonal_deriv_intermediate); + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, *input_deriv, kNoTrans, Vt_, kTrans, 0.0); } } // end namespace kaldi From 51e37119049387bdce56fc40ea53b458f2b6a25b Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Wed, 21 Nov 2018 17:04:35 +0800 Subject: [PATCH 17/87] fix --- src/matrix/matrix-functions.cc | 54 ++++++++++++++++++++-------------- src/matrix/matrix-functions.h | 1 + 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 7937dd51285..399ce5c1db0 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky // Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey) +// Gaofeng Cheng (Institute of Acoustics, Chinese Academy of Sciences) // See ../../COPYING for clarification regarding multiple authors // @@ -772,7 +773,8 @@ void AddOuterProductPlusMinus(double alpha, SvdRescaler::SvdRescaler(const MatrixBase &A, bool symmetric): input_matrix_A_(A), - symmetric_(symmetric) { + symmetric_(symmetric) + { int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), rc_min = std::min(rows, cols); Vector s(rc_min); // singular value vector @@ -786,14 +788,16 @@ SvdRescaler::SvdRescaler(const MatrixBase &A, 
} */ void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { + KALDI_ASSERT(A->NumRows() >= A->NumCols()); input_matrix_A_ = *A; if (symmetric) { symmetric_ = symmetric; } else { symmetric_ = false; } - int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), - rc_min = std::min(rows, cols); + int32 rows = input_matrix_A_.NumRows(), + cols = input_matrix_A_.NumCols(), + rc_min = cols; Vector s(rc_min); // singular value vector Matrix U(rows, rc_min), Vt(rc_min, cols); input_matrix_A_.DestructiveSvd(&s, &U, &Vt); @@ -831,8 +835,7 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, KALDI_ASSERT(output_deriv.NumRows() == U_.NumRows() && output_deriv.NumCols() == Vt_.NumRows() && input_deriv->NumRows() == U_.NumRows() && - input_deriv->NumCols() == Vt_.NumRows() && - U_.NumCols() == Vt_.NumRows()); + input_deriv->NumCols() == Vt_.NumRows()); // \bar{A} input_deriv->SetZero(); @@ -840,17 +843,23 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, Matrix intermediate_deriv(U_.NumCols(), Vt_.NumCols()); intermediate_deriv.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kNoTrans, 0.0); + // some intermediate variables // store the diriv of {f'(\lambda_{i})}\times{\bar\d_{i,i}} + // as diagonal_deriv_intermediate Vector diagonal_deriv_intermediate(U_.NumCols()); diagonal_deriv_intermediate.SetZero(); diagonal_deriv_intermediate.CopyDiagFromMat(intermediate_deriv); diagonal_deriv_intermediate.MulElements(*lambda_out_deriv_); - // store \lambda_{i} \times d_{i} + + // store \lambda_{i} \times d_{j} + // as diagonal_deriv_intermediate2 Matrix diagonal_deriv_intermediate2(U_.NumCols(), U_.NumCols()); diagonal_deriv_intermediate2.SetZero(); diagonal_deriv_intermediate2.AddVecVec(1.0, lambda_in_, *lambda_out_); + // store \lambda_{i} \times \lambda_{i} + // as diagonal_deriv_intermediate3 Vector diagonal_deriv_intermediate3(U_.NumCols()); diagonal_deriv_intermediate3.SetZero(); diagonal_deriv_intermediate3.AddVec2(1.0, lambda_in_); @@ -859,24 +868,25 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, { for(MatrixIndexT j = 0; j < Vt_.NumCols(); i++) { - // there may remain bugs! 
if ((lambda_in_(i) == 0.0) && (lambda_in_(j) == 0.0) && (i != j)) { (*input_deriv)(i, j) = intermediate_deriv(i, j) * (*lambda_out_deriv_)(i); - } else if ((i != j) && (lambda_in_(i) - lambda_in_(j) > 0.0000001)) { - (*input_deriv)(i, j) = intermediate_deriv(i, j) - *(diagonal_deriv_intermediate2(i, i) - diagonal_deriv_intermediate2(j, j)) - / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)) - + intermediate_deriv(j, i) - *(diagonal_deriv_intermediate2(j, i) - diagonal_deriv_intermediate2(i, j)) - / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)); - } else if ((i != j) && (lambda_in_(i) - lambda_in_(j) < 0.0000001)) { - float lambda_avg = (lambda_in_(i) + lambda_in_(j)) / 2.0; - (*input_deriv)(i, j) = intermediate_deriv(i, j) - * (lambda_avg * ((*lambda_out_deriv_)(i)) + (*lambda_out_)(i)) - / (2.0 * lambda_avg) - + intermediate_deriv(j, i) - * (lambda_avg * ((*lambda_out_deriv_)(i)) - (*lambda_out_)(i)) - / (2.0 * lambda_avg); + } else if (i != j) { + if (abs((lambda_in_(i) - lambda_in_(j)) / lambda_in_(j)) > 0.0000001) { + (*input_deriv)(i, j) = intermediate_deriv(i, j) + *(diagonal_deriv_intermediate2(i, i) - diagonal_deriv_intermediate2(j, j)) + / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)) + + intermediate_deriv(j, i) + *(diagonal_deriv_intermediate2(j, i) - diagonal_deriv_intermediate2(i, j)) + / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)); + } else { + float lambda_avg = (lambda_in_(i) + lambda_in_(j)) / 2.0; + (*input_deriv)(i, j) = intermediate_deriv(i, j) + * (lambda_avg * ((*lambda_out_deriv_)(i)) + (*lambda_out_)(i)) + / (2.0 * lambda_avg) + + intermediate_deriv(j, i) + * (lambda_avg * ((*lambda_out_deriv_)(i)) - (*lambda_out_)(i)) + / (2.0 * lambda_avg); + } } } } diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index de668ccf0b9..68314b7ce2f 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -222,6 +222,7 @@ class SvdRescaler { // directly after initializing the object with no args. Warning: this object // keeps a reference to this matrix, so don't modify it during the lifetime // of this object. + // This program assumes the input matrix (num_rows >= num_cols). void Init(const MatrixBase *A, bool symmetric); // Get the singular values of A, which will have been computed in the From dfd9e5d3938123bfc98b21c48368882610d0b673 Mon Sep 17 00:00:00 2001 From: GaofengCheng Date: Wed, 21 Nov 2018 22:50:34 +0800 Subject: [PATCH 18/87] Add test --- src/matrix/Makefile | 2 +- src/matrix/matrix-functions-test.cc | 68 +++++++++++++++++++++++++++++ src/matrix/matrix-functions.cc | 7 ++- src/matrix/matrix-functions.h | 7 +-- 4 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 src/matrix/matrix-functions-test.cc diff --git a/src/matrix/Makefile b/src/matrix/Makefile index e39be1ffec9..2fcf62fcb69 100644 --- a/src/matrix/Makefile +++ b/src/matrix/Makefile @@ -10,7 +10,7 @@ include ../kaldi.mk # you can uncomment matrix-lib-speed-test if you want to do the speed tests. 
-TESTFILES = matrix-lib-test sparse-matrix-test #matrix-lib-speed-test +TESTFILES = matrix-lib-test sparse-matrix-test matrix-functions-test #matrix-lib-speed-test OBJFILES = kaldi-matrix.o kaldi-vector.o packed-matrix.o sp-matrix.o tp-matrix.o \ matrix-functions.o qr.o srfft.o compressed-matrix.o \ diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc new file mode 100644 index 00000000000..97c00b74c2b --- /dev/null +++ b/src/matrix/matrix-functions-test.cc @@ -0,0 +1,68 @@ +// matrix/matrix-functions-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Institute of Acoustics, CAS (Gaofeng Cheng) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "matrix/matrix-functions.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { +void SvdRescalerTestInit() { + int32 rows = 10, cols = 10; + Matrix mat(rows, cols); + mat.SetRandn(); + SvdRescaler sc; + sc.Init(&mat, false); + + VectorBase &vec1 = sc.InputSingularValues(); + VectorBase &vec2 = *sc.OutputSingularValues(), + &vec3 = *sc.OutputSingularValueDerivs(); + + KALDI_ASSERT(vec1.Dim() == vec2.Dim() && + vec2.Dim() == vec3.Dim() && + vec1.Max() == vec2.Max() && + vec2.Max() == vec3.Max() && + vec1.Min() == vec2.Min() && + vec2.Min() == vec3.Min()); +} + +void SvdRescalerTestWrite() { + int32 rows = 10, cols = 10; + Matrix mat(rows, cols); + mat.SetRandn(); + SvdRescaler sc; + sc.Init(&mat, false); + + VectorBase &vec1 = sc.InputSingularValues(); + VectorBase &vec2 = *sc.OutputSingularValues(), + &vec3 = *sc.OutputSingularValueDerivs(); + + for(int32 i = 0; i < rows; i++) + { + KALDI_ASSERT((vec1)(i) == (vec2)(i)); + } +} +} // namespace kaldi + +int main() { + + kaldi::SvdRescalerTestInit(); + kaldi::SvdRescalerTestWrite(); + std::cout << "Test OK.\n"; +} \ No newline at end of file diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 399ce5c1db0..7e37cdd2250 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -769,12 +769,11 @@ void AddOuterProductPlusMinus(double alpha, const VectorBase &b, MatrixBase *plus, MatrixBase *minus); -/* + SvdRescaler::SvdRescaler(const MatrixBase &A, bool symmetric): input_matrix_A_(A), - symmetric_(symmetric) - { + symmetric_(symmetric) { int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), rc_min = std::min(rows, cols); Vector s(rc_min); // singular value vector @@ -786,7 +785,7 @@ SvdRescaler::SvdRescaler(const MatrixBase &A, U_ = U; Vt_ = Vt; } -*/ + void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { KALDI_ASSERT(A->NumRows() >= A->NumCols()); input_matrix_A_ = *A; diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 68314b7ce2f..02353dc458e 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -259,12 +259,13 @@ 
class SvdRescaler { // on the same object. void ComputeInputDeriv(const MatrixBase &output_deriv, MatrixBase *input_deriv); + protected: Matrix input_matrix_A_; bool symmetric_; - MatrixBase U_, Vt_; - VectorBase lambda_in_; - VectorBase *lambda_out_, *lambda_out_deriv_; + Matrix U_, Vt_; + Vector lambda_in_; + Vector *lambda_out_, *lambda_out_deriv_; }; /// @} end of "addtogroup matrix_funcs_misc" From 84f8d6265b78164cc5c234dba899b4c02c593623 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 21 Nov 2018 23:34:49 -0500 Subject: [PATCH 19/87] Fixes to SVD code --- src/matrix/kaldi-matrix.h | 4 + src/matrix/matrix-functions-test.cc | 62 ++++----- src/matrix/matrix-functions.cc | 198 +++++++++++++--------------- src/matrix/matrix-functions.h | 48 ++++--- 4 files changed, 156 insertions(+), 156 deletions(-) diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index a973824128c..ed162c20ce4 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -529,6 +529,10 @@ class MatrixBase { * positive semi-definite (check_thresh controls how stringent the check is; * set it to 2 to ensure it won't ever complain, but it will zero out negative * dimensions in your matrix. + * + * Caution: if you want the eigenvalues, it may make more sense to convert to + * SpMatrix and use Eig() function there, which uses eigenvalue decomposition + * directly rather than SVD. */ void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, Real check_thresh = 0.001); diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc index 97c00b74c2b..fa66e2f3f16 100644 --- a/src/matrix/matrix-functions-test.cc +++ b/src/matrix/matrix-functions-test.cc @@ -23,46 +23,42 @@ #include "matrix/kaldi-matrix.h" namespace kaldi { -void SvdRescalerTestInit() { - int32 rows = 10, cols = 10; - Matrix mat(rows, cols); + +void SvdRescalerTestIdentity() { + // this tests the case where f() is the identity function. + int32 dim = 10; + Matrix mat(dim, dim); + if (RandInt(0, 1) == 0) mat.SetRandn(); - SvdRescaler sc; - sc.Init(&mat, false); + // else zero. 
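+  // (The all-zero case is useful because it exercises the branch of the
+  // derivative code that handles a pair of singular values which are both
+  // exactly zero.)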
- VectorBase &vec1 = sc.InputSingularValues(); - VectorBase &vec2 = *sc.OutputSingularValues(), - &vec3 = *sc.OutputSingularValueDerivs(); + SvdRescaler sc; + sc.Init(&mat, false); - KALDI_ASSERT(vec1.Dim() == vec2.Dim() && - vec2.Dim() == vec3.Dim() && - vec1.Max() == vec2.Max() && - vec2.Max() == vec3.Max() && - vec1.Min() == vec2.Min() && - vec2.Min() == vec3.Min()); + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = lambda[i]; + fprime_lambda[i] = 1.0; + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + AssertEqual(mat, output, 0.001); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + KALDI_LOG << output_deriv << input_deriv; + AssertEqual(output_deriv, input_deriv); } -void SvdRescalerTestWrite() { - int32 rows = 10, cols = 10; - Matrix mat(rows, cols); - mat.SetRandn(); - SvdRescaler sc; - sc.Init(&mat, false); - - VectorBase &vec1 = sc.InputSingularValues(); - VectorBase &vec2 = *sc.OutputSingularValues(), - &vec3 = *sc.OutputSingularValueDerivs(); - for(int32 i = 0; i < rows; i++) - { - KALDI_ASSERT((vec1)(i) == (vec2)(i)); - } -} } // namespace kaldi int main() { - - kaldi::SvdRescalerTestInit(); - kaldi::SvdRescalerTestWrite(); + for (int32 i = 0; i < 10; i++) { + kaldi::SvdRescalerTestIdentity(); + } std::cout << "Test OK.\n"; -} \ No newline at end of file +} diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 7e37cdd2250..0475e1f8b3c 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -770,126 +770,110 @@ void AddOuterProductPlusMinus(double alpha, MatrixBase *plus, MatrixBase *minus); -SvdRescaler::SvdRescaler(const MatrixBase &A, - bool symmetric): - input_matrix_A_(A), - symmetric_(symmetric) { - int32 rows = input_matrix_A_.NumRows(), cols = input_matrix_A_.NumCols(), - rc_min = std::min(rows, cols); - Vector s(rc_min); // singular value vector - Matrix U(rows, rc_min), Vt(rc_min, cols); - input_matrix_A_.DestructiveSvd(&s, &U, &Vt); - SortSvd(&s, &U, &Vt); - lambda_in_ = s; - *lambda_out_ = s; - U_ = U; - Vt_ = Vt; -} - void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { - KALDI_ASSERT(A->NumRows() >= A->NumCols()); - input_matrix_A_ = *A; - if (symmetric) { - symmetric_ = symmetric; - } else { - symmetric_ = false; - } - int32 rows = input_matrix_A_.NumRows(), - cols = input_matrix_A_.NumCols(), - rc_min = cols; - Vector s(rc_min); // singular value vector - Matrix U(rows, rc_min), Vt(rc_min, cols); - input_matrix_A_.DestructiveSvd(&s, &U, &Vt); - SortSvd(&s, &U, &Vt); - lambda_in_.CopyFromVec(s); - lambda_out_->CopyFromVec(s); - lambda_out_deriv_->CopyFromVec(s); - U_.CopyFromMat(U); - Vt_.CopyFromMat(Vt); + KALDI_ASSERT(A->NumRows() == A->NumCols()); + A_ = A; + symmetric_ = symmetric; + int32 dim = A->NumRows(); + lambdas_.Resize(3, dim, kUndefined); + U_.Resize(dim, dim, kUndefined); + SubVector lambda(lambdas_, 0); + if (symmetric) { + // the following constructor will check that A is actually symmetric. 
+ SpMatrix A_sym(*A_, kTakeMeanAndCheck); + A_sym.Eig(&lambda, &U_); + } else { + Vt_.Resize(dim, dim, kUndefined); + A_->Svd(&lambda, &U_, &Vt_); + } } -VectorBase &SvdRescaler::InputSingularValues() { - return lambda_in_; +BaseFloat *SvdRescaler::InputSingularValues() { + return lambdas_.RowData(0); } -VectorBase *SvdRescaler::OutputSingularValues() { - return lambda_out_; +BaseFloat *SvdRescaler::OutputSingularValues() { + return lambdas_.RowData(1); } -VectorBase *SvdRescaler::OutputSingularValueDerivs() { - return lambda_out_deriv_; +BaseFloat *SvdRescaler::OutputSingularValueDerivs() { + return lambdas_.RowData(2); } void SvdRescaler::GetOutput(MatrixBase *output) { - KALDI_ASSERT(output->NumRows() == input_matrix_A_.NumRows() && - output->NumCols() == input_matrix_A_.NumCols()); - Matrix U_tmpt(U_); - U_tmpt.MulColsVec(*lambda_out_); - U_tmpt.AddMatMat(1.0, U_tmpt, kNoTrans, Vt_, kNoTrans, 0.0); - output->CopyFromMat(U_tmpt); + int32 dim = A_->NumRows(); + SubVector f_lambda(lambdas_, 1); // f(lambda) in the writeup. + if (symmetric_) { + SpMatrix S(dim); + S.AddMat2Vec(1.0, U_, kNoTrans, f_lambda, 0.0); + output->CopyFromSp(S); + } else { + Matrix U_tmp(U_); + U_tmp.MulColsVec(f_lambda); + output->SetZero(); + output->AddMatMat(1.0, U_tmp, kNoTrans, Vt_, kNoTrans, 0.0); + } } void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, - MatrixBase *input_deriv) { - KALDI_ASSERT(output_deriv.NumRows() == U_.NumRows() && - output_deriv.NumCols() == Vt_.NumRows() && - input_deriv->NumRows() == U_.NumRows() && - input_deriv->NumCols() == Vt_.NumRows()); - // \bar{A} - input_deriv->SetZero(); - - // \bar{D} - Matrix intermediate_deriv(U_.NumCols(), Vt_.NumCols()); - intermediate_deriv.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, - Vt_, kNoTrans, 0.0); - - // some intermediate variables - // store the diriv of {f'(\lambda_{i})}\times{\bar\d_{i,i}} - // as diagonal_deriv_intermediate - Vector diagonal_deriv_intermediate(U_.NumCols()); - diagonal_deriv_intermediate.SetZero(); - diagonal_deriv_intermediate.CopyDiagFromMat(intermediate_deriv); - diagonal_deriv_intermediate.MulElements(*lambda_out_deriv_); - - // store \lambda_{i} \times d_{j} - // as diagonal_deriv_intermediate2 - Matrix diagonal_deriv_intermediate2(U_.NumCols(), U_.NumCols()); - diagonal_deriv_intermediate2.SetZero(); - diagonal_deriv_intermediate2.AddVecVec(1.0, lambda_in_, *lambda_out_); - - // store \lambda_{i} \times \lambda_{i} - // as diagonal_deriv_intermediate3 - Vector diagonal_deriv_intermediate3(U_.NumCols()); - diagonal_deriv_intermediate3.SetZero(); - diagonal_deriv_intermediate3.AddVec2(1.0, lambda_in_); - - for(MatrixIndexT i = 0; i < U_.NumCols(); i++) - { - for(MatrixIndexT j = 0; j < Vt_.NumCols(); i++) - { - if ((lambda_in_(i) == 0.0) && (lambda_in_(j) == 0.0) && (i != j)) { - (*input_deriv)(i, j) = intermediate_deriv(i, j) * (*lambda_out_deriv_)(i); - } else if (i != j) { - if (abs((lambda_in_(i) - lambda_in_(j)) / lambda_in_(j)) > 0.0000001) { - (*input_deriv)(i, j) = intermediate_deriv(i, j) - *(diagonal_deriv_intermediate2(i, i) - diagonal_deriv_intermediate2(j, j)) - / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)) - + intermediate_deriv(j, i) - *(diagonal_deriv_intermediate2(j, i) - diagonal_deriv_intermediate2(i, j)) - / (diagonal_deriv_intermediate3(i) - diagonal_deriv_intermediate3(j)); - } else { - float lambda_avg = (lambda_in_(i) + lambda_in_(j)) / 2.0; - (*input_deriv)(i, j) = intermediate_deriv(i, j) - * (lambda_avg * ((*lambda_out_deriv_)(i)) + 
(*lambda_out_)(i)) - / (2.0 * lambda_avg) - + intermediate_deriv(j, i) - * (lambda_avg * ((*lambda_out_deriv_)(i)) - (*lambda_out_)(i)) - / (2.0 * lambda_avg); - } - } - } + MatrixBase *input_deriv) const { + int32 dim = A_->NumRows(); + KALDI_ASSERT(output_deriv.NumRows() == dim && output_deriv.NumCols() == dim && + input_deriv->NumRows() == dim && input_deriv->NumCols() == dim); + // input_deriv is \bar{A} in the writeup. + input_deriv->SetZero(); + + // \bar{D} in the writeup; see class declaration. + Matrix bar_d(dim, dim); + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kTrans, 0.0); + + Matrix bar_lambda(dim, dim); + + const BaseFloat *lambda = lambdas_.RowData(0), // elements \lambda_i + *f_lambda = lambdas_.RowData(1), // elements f(\lambda_i) + *f_lambda_deriv = lambdas_.RowData(2); // elements f'(lambda_i) + + // we use doubles in the computations below, to avoid underflow if any floating + // point values were extremely close to zero (e.g., denormal) + for(int32 i = 0; i < dim; i++) { + double lambda_i = lambda[i], lambda2_i = lambda_i * lambda_i, + d_i = f_lambda[i]; + for(int32 j = 0; j < dim; j++) { + double lambda_j = lambda[j], lambda2_j = lambda_j * lambda_j, + d_j = f_lambda[j], bar_d_ij = bar_d(i, j), + bar_d_ji = bar_d(j, i), bar_lambda_ij; + // if lambda_i and lambda_j are not (relatively) too close in value (which + // implies that at least one them is nonzero).. + if (std::abs(lambda_i - lambda_j) > 1.0e-03 * std::abs(lambda_i)) { + bar_lambda_ij = bar_d_ij * ((lambda_i * d_i - lambda_j * d_j) / + (lambda2_i - lambda2_j)) + + bar_d_ji * ((lambda_j * d_i - lambda_i * d_j) / + (lambda2_i - lambda2_j)); + } else if (lambda_i != 0) { + // If we reached here, it implies they are both nonzero, but extremely + // close in value. + // lambda is the average of the two lambdas. + // Assume f'(lambda) is the average of the two derivatives. + double lambda = 0.5 * (lambda_i + lambda_j), + f_prime_lambda = 0.5 * (f_lambda_deriv[i] + f_lambda_deriv[j]), + d = 0.5 * (d_i + d_j); + bar_lambda_ij = bar_d_ij * ((lambda * f_prime_lambda + d) / (2.0 * lambda)) + + bar_d_ji * ((lambda * f_prime_lambda - d) / (2.0 * lambda)); + } else { + // both zero. + KALDI_ASSERT(lambda_i == 0 && lambda_j == 0); + bar_lambda_ij = bar_d_ij * f_lambda_deriv[i]; + } + bar_lambda(i, j) = bar_lambda_ij; } - input_deriv->CopyDiagFromVec(diagonal_deriv_intermediate); - input_deriv->AddMatMatMat(1.0, U_, kNoTrans, *input_deriv, kNoTrans, Vt_, kTrans, 0.0); + } + if (!symmetric_) { + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, + Vt_, kNoTrans, 0.0); + } else { + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, + U_, kTrans, 0.0); + } } + } // end namespace kaldi diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 02353dc458e..2b3ec8133e9 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -169,8 +169,8 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase http://www.danielpovey.com/files/2018_svd_derivative.pdf and to backprop through that computation. Short summary: it allows you to apply some kind of scalar function - to the singular values of a matrix, reconstruct it, and then backprop - through that operation. + to the singular values of a square matrix, reconstruct it, and then + backprop through that operation. 
This class is quite general-purpose in the sense that you can provide any scalar function; but in order to avoid things like @@ -212,21 +212,24 @@ class SvdRescaler { instead of A = U diag(s) V^T, using SpMatrix::Eig(). You can view this as a special case of SVD. */ - SvdRescaler(const MatrixBase &A, bool symmetric); + SvdRescaler(const MatrixBase *A, bool symmetric) { + Init(A, symmetric); + } // Constructor that takes no args. In this case you are supposed to // call Init() - SvdRescaler(); + SvdRescaler() { } // An alternative to the constructor that takes args. Should only be called // directly after initializing the object with no args. Warning: this object // keeps a reference to this matrix, so don't modify it during the lifetime // of this object. - // This program assumes the input matrix (num_rows >= num_cols). + // A is required to be square. void Init(const MatrixBase *A, bool symmetric); - // Get the singular values of A, which will have been computed in the - // constructor. The reason why this is not const is that there may be + // Return a pointer to the the singular values of A, which will have been + // computed in the constructor. + // The reason why this is not const is that there may be // situations where you discover that the input matrix has some very small // singular values, and you want to (say) floor them somehow and reconstruct, // and have the derivatives be valid assuming you had given that 'repaired' @@ -234,15 +237,15 @@ class SvdRescaler { // a way to do that, although currently this class doesn't provide a way // for you to access that 'fixed-up' A directly. // We hope you know what you are doing if you modify these singular values. - VectorBase &InputSingularValues(); + BaseFloat *InputSingularValues(); // Returns a pointer to a place that you can write the // modified singular values f(lambda). - VectorBase *OutputSingularValues(); + BaseFloat *OutputSingularValues(); // Returns a pointer to a place that you can write the // values of f'(lambda) (the function-derivative of f). - VectorBase *OutputSingularValueDerivs(); + BaseFloat *OutputSingularValueDerivs(); // Outputs F(A) to 'output', which must have the correct size. // It's OK if 'output' contains NaNs on entry. @@ -258,14 +261,27 @@ class SvdRescaler { // values of 'output_deriv' and 'input_deriv' as many times as you want, // on the same object. void ComputeInputDeriv(const MatrixBase &output_deriv, - MatrixBase *input_deriv); + MatrixBase *input_deriv) const; protected: - Matrix input_matrix_A_; - bool symmetric_; - Matrix U_, Vt_; - Vector lambda_in_; - Vector *lambda_out_, *lambda_out_deriv_; + // the input matrix A. Owned by the user but will not be changed by them + // during the lifetime of this object. + const MatrixBase *A_; + bool symmetric_; + // U_ is present regardless of whether symmetric_ is true. It is the + // left part of the decomposition A = U diag(s) V^T. + Matrix U_; + // Vt_ is only present if symmetric_ is false. Otherwise, we + // assume that Vt_ equals U_. + Matrix Vt_; + + // a matrix containing three rows, and num-cols equal to the num-rows of the + // symmetric matrix A_. + // row 0 is 'lambda_in' (the input singular values; or the input eigenvalues, + // in the symmetric case). + // row 1 is 'lambda_out' (the input singular values, i.e. f(lambda)), + // row 2 is 'lambda_out_deriv' (the function-derivative f'(lambda)). 
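+  // Storing the three per-singular-value arrays as rows of a single matrix
+  // lets InputSingularValues(), OutputSingularValues() and
+  // OutputSingularValueDerivs() simply return lambdas_.RowData(0), (1) and (2)
+  // without any extra allocation.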
+ Matrix lambdas_; }; /// @} end of "addtogroup matrix_funcs_misc" From 0ce47b61904a6b22e189ad21f28c011139c4cb39 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 21 Nov 2018 23:49:30 -0500 Subject: [PATCH 20/87] [src] Further test functions added. --- src/matrix/matrix-functions-test.cc | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc index fa66e2f3f16..464b46a0e3e 100644 --- a/src/matrix/matrix-functions-test.cc +++ b/src/matrix/matrix-functions-test.cc @@ -53,12 +53,49 @@ void SvdRescalerTestIdentity() { AssertEqual(output_deriv, input_deriv); } +void SvdRescalerTestPowerDiag() { + // this tests the case where f() is a power function with random exponent, + // and the matrix is diagonal. + int32 dim = 10; + BaseFloat power = 0.25 * RandInt(0, 4); + Matrix mat(dim, dim); + for (int32 i = 0; i < dim; i++) + mat(i, i) = 0.25 * RandInt(0, 10); + + SvdRescaler sc; + sc.Init(&mat, false); + + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = pow(lambda[i], power); + fprime_lambda[i] = power * pow(lambda[i], power - 1.0); + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + KALDI_ASSERT(mat.IsDiagonal(0.001)); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + + for (int32 i = 0; i < dim; i++) { + BaseFloat oderiv = output_deriv(i, i), + ideriv = input_deriv(i, i), + x = mat(i, i), + df = power * pow(x, power - 1.0); + AssertEqual(ideriv, oderiv * df); + } +} + } // namespace kaldi int main() { for (int32 i = 0; i < 10; i++) { kaldi::SvdRescalerTestIdentity(); + kaldi::SvdRescalerTestPowerDiag(); } std::cout << "Test OK.\n"; } From fd0fa946dfe6e83c86715c1800bc5c77e173b7e1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 22 Nov 2018 00:24:17 -0500 Subject: [PATCH 21/87] [src] More fixes --- src/matrix/matrix-functions-test.cc | 7 ++++++- src/matrix/matrix-functions.cc | 9 ++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc index 464b46a0e3e..981f23c7746 100644 --- a/src/matrix/matrix-functions-test.cc +++ b/src/matrix/matrix-functions-test.cc @@ -59,8 +59,13 @@ void SvdRescalerTestPowerDiag() { int32 dim = 10; BaseFloat power = 0.25 * RandInt(0, 4); Matrix mat(dim, dim); - for (int32 i = 0; i < dim; i++) + for (int32 i = 0; i < dim; i++) { mat(i, i) = 0.25 * RandInt(0, 10); + // if power < 1.0, we can't allow zero diagonal + // elements, or the derivatives would be undefined. + if (power < 1.0 && mat(i, i) == 0.0) + mat(i, i) = 0.333; + } SvdRescaler sc; sc.Init(&mat, false); diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 0475e1f8b3c..a4a7ccbd099 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -842,9 +842,12 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, double lambda_j = lambda[j], lambda2_j = lambda_j * lambda_j, d_j = f_lambda[j], bar_d_ij = bar_d(i, j), bar_d_ji = bar_d(j, i), bar_lambda_ij; - // if lambda_i and lambda_j are not (relatively) too close in value (which - // implies that at least one them is nonzero).. 
- if (std::abs(lambda_i - lambda_j) > 1.0e-03 * std::abs(lambda_i)) { + + if (i == j) { + bar_lambda_ij = bar_d_ij * f_lambda_deriv[i]; + } else if (std::abs(lambda_i - lambda_j) > 1.0e-03 * std::abs(lambda_i)) { + // if lambda_i and lambda_j are not (relatively) too close in value (which + // implies that at least one them is nonzero).. bar_lambda_ij = bar_d_ij * ((lambda_i * d_i - lambda_j * d_j) / (lambda2_i - lambda2_j)) + bar_d_ji * ((lambda_j * d_i - lambda_i * d_j) / From 91d5743b1a38f8768fb120ec1016fdf7500481f4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 22 Nov 2018 00:54:52 -0500 Subject: [PATCH 22/87] Further fixes. --- src/matrix/matrix-functions-test.cc | 82 ++++++++++++++++++++++++++++- src/matrix/matrix-functions.cc | 10 ++-- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc index 981f23c7746..203892a54e3 100644 --- a/src/matrix/matrix-functions-test.cc +++ b/src/matrix/matrix-functions-test.cc @@ -21,6 +21,7 @@ #include "matrix/matrix-functions.h" #include "matrix/kaldi-vector.h" #include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" namespace kaldi { @@ -31,9 +32,10 @@ void SvdRescalerTestIdentity() { if (RandInt(0, 1) == 0) mat.SetRandn(); // else zero. + bool symmetric = false; SvdRescaler sc; - sc.Init(&mat, false); + sc.Init(&mat, symmetric); BaseFloat *lambda = sc.InputSingularValues(), *f_lambda= sc.OutputSingularValues(), @@ -58,6 +60,7 @@ void SvdRescalerTestPowerDiag() { // and the matrix is diagonal. int32 dim = 10; BaseFloat power = 0.25 * RandInt(0, 4); + bool symmetric = (RandInt(0, 1) == 0); Matrix mat(dim, dim); for (int32 i = 0; i < dim; i++) { mat(i, i) = 0.25 * RandInt(0, 10); @@ -68,7 +71,7 @@ void SvdRescalerTestPowerDiag() { } SvdRescaler sc; - sc.Init(&mat, false); + sc.Init(&mat, symmetric); BaseFloat *lambda = sc.InputSingularValues(), *f_lambda= sc.OutputSingularValues(), @@ -95,12 +98,87 @@ void SvdRescalerTestPowerDiag() { } +void SvdRescalerTestExp() { + // this tests the case where f() is the exponential function, and the matrix + // is an arbitrary matrix. + int32 dim = 10; + //bool symmetric = (RandInt(0, 1) == 0); + bool symmetric = false; + BaseFloat exp_scale = 0.2 * RandInt(0, 5); + + Matrix mat(dim, dim); + + if (symmetric) { + SpMatrix s(dim); + s.SetRandn(); + mat.CopyFromSp(s); + } else { + mat.SetRandn(); + } + + KALDI_LOG << "Matrix sum is " << mat.Sum(); + + SvdRescaler sc; + sc.Init(&mat, symmetric); + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = exp(exp_scale * lambda[i]); + fprime_lambda[i] = exp_scale * exp(exp_scale * lambda[i]); + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + + + // use random directions to test the accuracy of the derivatives. 
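+  // Specifically: for each random direction delta we compare the predicted
+  // first-order change tr(delta^T input_deriv) with the actual change in
+  // tr(output_deriv^T F(A)) when the input is perturbed to A + delta.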
+ int32 n = 4; + Vector expected_change(n), actual_change(n); + BaseFloat epsilon = 0.001; + for (int32 k = 0; k < n; k++) { + Matrix delta(dim, dim); + if (symmetric) { + SpMatrix s(dim); + s.SetRandn(); + delta.CopyFromSp(s); + } else { + delta.SetRandn(); + } + delta.Scale(epsilon); + expected_change(k) = TraceMatMat(delta, input_deriv, kTrans); + delta.AddMat(1.0, mat); + SvdRescaler sc2(&delta, symmetric); + BaseFloat *lambda = sc2.InputSingularValues(), + *f_lambda= sc2.OutputSingularValues(), + *fprime_lambda = sc2.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = exp(exp_scale * lambda[i]); + fprime_lambda[i] = exp_scale * exp(exp_scale * lambda[i]); + } + Matrix output_perturbed(dim, dim); + sc2.GetOutput(&output_perturbed); + actual_change(k) = TraceMatMat(output_deriv, output_perturbed, kTrans) - + TraceMatMat(output_deriv, output, kTrans); + } + KALDI_LOG << "Matrix sum is " << mat.Sum(); + KALDI_LOG << "Predicted " << expected_change + << " vs. actual " << actual_change; + AssertEqual(expected_change, actual_change, 0.01); +} + + + } // namespace kaldi int main() { for (int32 i = 0; i < 10; i++) { kaldi::SvdRescalerTestIdentity(); kaldi::SvdRescalerTestPowerDiag(); + kaldi::SvdRescalerTestExp(); } std::cout << "Test OK.\n"; } diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index a4a7ccbd099..7a222026010 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -825,7 +825,10 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, // \bar{D} in the writeup; see class declaration. Matrix bar_d(dim, dim); - bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kTrans, 0.0); + if (!symmetric_) + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kTrans, 0.0); + else + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, U_, kNoTrans, 0.0); Matrix bar_lambda(dim, dim); @@ -870,13 +873,12 @@ void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, bar_lambda(i, j) = bar_lambda_ij; } } - if (!symmetric_) { + if (!symmetric_) input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, Vt_, kNoTrans, 0.0); - } else { + else input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, U_, kTrans, 0.0); - } } } // end namespace kaldi From 35442a6f92354e1149e10dbcac56f7fa78c6bcdc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 27 Nov 2018 21:40:44 -0500 Subject: [PATCH 23/87] [src] Commit nearly-working version of differentiable fmllr code (var derivs not right) --- src/transform/differentiable-fmllr-test.cc | 365 ++++++++++- src/transform/differentiable-fmllr.cc | 538 ++++++++++++++-- src/transform/differentiable-fmllr.h | 566 ++++++++++++++++- src/transform/differentiable-transform.h | 688 +++++++++++++++++++++ 4 files changed, 2097 insertions(+), 60 deletions(-) create mode 100644 src/transform/differentiable-transform.h diff --git a/src/transform/differentiable-fmllr-test.cc b/src/transform/differentiable-fmllr-test.cc index 4ea12936997..9d4ab6c9cd5 100644 --- a/src/transform/differentiable-fmllr-test.cc +++ b/src/transform/differentiable-fmllr-test.cc @@ -15,9 +15,10 @@ // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and -//1 limitations under the License. +// limitations under the License. 
#include "transform/differentiable-fmllr.h" +#include "matrix/sp-matrix.h" namespace kaldi { namespace differentiable_transform { @@ -37,7 +38,7 @@ void TestCoreFmllrEstimatorKDeriv( actual_changes(num_directions); int32 dim = G.NumRows(); - BaseFloat epsilon = 1.0e-04 * gamma; + BaseFloat epsilon = 1.0e-03 * gamma; Matrix A_deriv(dim, dim); // A_deriv defines the objective function: a random linear function in A. A_deriv.SetRandn(); @@ -52,7 +53,7 @@ void TestCoreFmllrEstimatorKDeriv( K_new.Scale(epsilon); expected_changes(i) = TraceMatMat(K_new, K_deriv, kTrans); K_new.AddMat(1.0, K); - CoreFmllrEstimatorOptions opts; + FmllrEstimatorOptions opts; Matrix A_new(dim, dim); CoreFmllrEstimator estimator2(opts, gamma, G, K_new, &A_new); estimator2.Forward(); @@ -63,6 +64,8 @@ void TestCoreFmllrEstimatorKDeriv( actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); } + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { KALDI_ERR << "Expected and actual changes differ too much: " << expected_changes << " vs. " @@ -70,6 +73,61 @@ void TestCoreFmllrEstimatorKDeriv( } } +// Test derivatives produced by the Estimator object for G. +void TestCoreFmllrEstimatorGDeriv( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-03 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + KALDI_ASSERT(G_deriv.IsSymmetric()); + + for (int32 i = 0; i < num_directions; i++) { + Matrix G_new(dim, dim); + { + SpMatrix s(dim); + s.SetRandn(); + G_new.CopyFromSp(s); + } + G_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(G_new, G_deriv, kTrans); + G_new.AddMat(1.0, G); + FmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G_new, K, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that. + actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + + void UnitTestCoreFmllrEstimatorSimple() { int32 dim = RandInt(10, 20); @@ -78,7 +136,7 @@ void UnitTestCoreFmllrEstimatorSimple() { K(dim, dim), A(dim, dim, kUndefined); G.AddToDiag(1.234 * gamma); K.AddToDiag(0.234 * gamma); - CoreFmllrEstimatorOptions opts; + FmllrEstimatorOptions opts; CoreFmllrEstimator estimator(opts, gamma, G, K, &A); BaseFloat objf_impr = estimator.Forward(); KALDI_LOG << "A is " << A; @@ -86,10 +144,301 @@ void UnitTestCoreFmllrEstimatorSimple() { KALDI_ASSERT(fabs(objf_impr) < 0.01); for (int32 i = 0; i < 5; i++) { TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); - // TestCoreFmllrEstimatorGDeriv(G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +void UnitTestCoreFmllrEstimatorGeneral() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + + { + // make sure G is symmetric and +ve definite. + Matrix A(dim, dim + 5); + A.SetRandn(); + G.AddMatMat(gamma, A, kNoTrans, A, kTrans, 0.0); + } + + K.SetRandn(); + K.Scale(gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A << ", objf impr is " << objf_impr; + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +void TestGaussianEstimatorDerivs(const MatrixBase &feats, + const Posterior &post, + const FmllrEstimatorOptions &opts, + GaussianEstimator *g) { + int32 n = 4; // number of delta-params we use. + Vector expected_changes(n), + actual_changes(n); + + // if !test_mean_deriv, then we test the var deriv. + bool test_mean_deriv = (RandInt(0, 1) == 0); + + int32 num_classes = g->NumClasses(), dim = g->Dim(); + + Matrix mean_derivs(num_classes, dim); + Vector var_derivs(num_classes); + if (test_mean_deriv) { + KALDI_LOG << "Testing mean derivs."; + mean_derivs.SetRandn(); + } else { + KALDI_LOG << "Testing var derivs."; + var_derivs.SetRandn(); + } + g->SetOutputDerivs(mean_derivs, var_derivs); + Matrix feats_deriv(feats.NumRows(), feats.NumCols()); + g->Backward(feats, post, &feats_deriv); + + BaseFloat epsilon = 1.0e-03; + + for (int32 i = 0; i < n; i++) { + Matrix new_feats(feats.NumRows(), + feats.NumCols()); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + + expected_changes(i) = TraceMatMat(feats_deriv, new_feats, kTrans); + + new_feats.AddMat(1.0, feats); + + GaussianEstimator g2(num_classes, dim); + g2.AccStats(new_feats, post); + g2.Estimate(opts); + + actual_changes(i) = + TraceMatMat(mean_derivs, g2.GetMeans(), kTrans) - + TraceMatMat(mean_derivs, g->GetMeans(), kTrans) + + VecVec(var_derivs, g2.GetVars()) - + VecVec(var_derivs, g->GetVars()); + } + KALDI_LOG << "Actual changes are " << actual_changes + << " vs. predicted " << expected_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + +void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = f.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + FmllrEstimator f2(opts, new_mu, s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; } } +void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. 
+ Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const VectorBase &s_deriv = f.GetVarDeriv(); + + // measure the accuracy of the deriv in 10 random directions + int32 n = 10; + BaseFloat epsilon = 0.1; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Vector new_s(num_classes, kUndefined); + Matrix new_adapted_feats(T, dim, kUndefined); + new_s.SetRandn(); + new_s.Scale(epsilon); + expected_changes(i) = VecVec(new_s, s_deriv); + new_s.AddVec(1.0, s); + FmllrEstimator f2(opts, mu, new_s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + + +void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + FmllrEstimator f2(opts, mu, s); + f2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + + +void UnitTestGaussianAndFmllrEstimator() { + int32 num_classes = RandInt(50, 100), + dim = RandInt(5, 10), + num_frames = RandInt(40 * num_classes, 100 * num_classes); + + GaussianEstimator g(num_classes, dim); + + Matrix feats(num_frames, dim); + feats.SetRandn(); + Posterior post(num_frames); + for (int32 t = 0; t < num_frames; t++) { + int32 n = RandInt(0, 2); + for (int32 j = 0; j < n; j++) { + int32 i = RandInt(0, num_classes - 1); + BaseFloat p = 0.25 * RandInt(1, 5); + post[t].push_back(std::pair(i, p)); + } + } + g.AccStats(feats, post); + FmllrEstimatorOptions opts; + opts.variance_sharing_weight = 0.25 * RandInt(0, 4); // will try other values later. + g.Estimate(opts); + KALDI_LOG << "Means are: " + << g.GetMeans() << ", vars are: " + << g.GetVars(); + TestGaussianEstimatorDerivs(feats, post, opts, &g); + + TestFmllrEstimatorFeatDerivs(feats, post, g); + TestFmllrEstimatorMeanDerivs(feats, post, g); + TestFmllrEstimatorVarDerivs(feats, post, g); +} + + } // namespace kaldi } // namespace differentiable_transform @@ -99,6 +448,10 @@ void UnitTestCoreFmllrEstimatorSimple() { int main() { using namespace kaldi::differentiable_transform; - UnitTestCoreFmllrEstimatorSimple(); + for (int32 i = 0; i < 5; i++) { + UnitTestCoreFmllrEstimatorSimple(); + UnitTestCoreFmllrEstimatorGeneral(); + UnitTestGaussianAndFmllrEstimator(); + } std::cout << "Test OK.\n"; } diff --git a/src/transform/differentiable-fmllr.cc b/src/transform/differentiable-fmllr.cc index 59fc1d59507..645977ce606 100644 --- a/src/transform/differentiable-fmllr.cc +++ b/src/transform/differentiable-fmllr.cc @@ -24,7 +24,7 @@ namespace kaldi { namespace differentiable_transform { CoreFmllrEstimator::CoreFmllrEstimator( - const CoreFmllrEstimatorOptions &opts, + const FmllrEstimatorOptions &opts, BaseFloat gamma, const MatrixBase &G, const MatrixBase &K, @@ -50,28 +50,31 @@ void CoreFmllrEstimator::ComputeH() { int32 dim = G_.NumRows(); bool symmetric = true; G_rescaler_.Init(&G_, symmetric); - VectorBase &G_singular_values = G_rescaler_.InputSingularValues(); - BaseFloat floor = - G_singular_values.Max() * opts_.singular_value_relative_floor; - KALDI_ASSERT(floor > 0.0); - MatrixIndexT num_floored = 0; - G_singular_values.ApplyFloor(floor, &num_floored); - if (num_floored > 0.0) - KALDI_WARN << num_floored << " out of " << dim - << " singular values floored in G matrix."; - VectorBase - &H_singular_values = *G_rescaler_.OutputSingularValues(), - &H_singular_value_derivs = *G_rescaler_.OutputSingularValueDerivs(); - H_singular_values.CopyFromVec(G_singular_values); - // H is going to be G^{-0.5}. - // We don't have to worry about division by zero because we already floored the - // singular values of G. - H_singular_values.ApplyPow(-0.5); - // the derivative of lambda^{-0.5} w.r.t. lambda is -0.5 lambda^{-1.5}; - // we fill in this value in H_singular_value_derivs. 
- H_singular_value_derivs.CopyFromVec(G_singular_values); - H_singular_value_derivs.ApplyPow(-1.5); - H_singular_value_derivs.Scale(-0.5); + BaseFloat *G_singular_values = G_rescaler_.InputSingularValues(); + + { + SubVector v(G_singular_values, dim); + BaseFloat floor = v.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + v.ApplyFloor(floor, &num_floored); + if (num_floored > 0.0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in G matrix."; + } + BaseFloat *H_singular_values = G_rescaler_.OutputSingularValues(), + *H_singular_value_derivs = G_rescaler_.OutputSingularValueDerivs(); + // We don't have to worry about elements of G_singular_values being zero, + // since we floored them above. + for (int32 i = 0; i < dim; i++) { + H_singular_values[i] = 1.0 / std::sqrt(G_singular_values[i]); + // The following expression is equivalent to + // -0.5 * pow(G_singular_values[i], -1.5), + // which is the derivative of lambda^{-0.5} w.r.t lambda. + // (lambda, here, is G_singular_values[i]). + H_singular_value_derivs[i] = -0.5 * (H_singular_values[i] / + G_singular_values[i]); + } H_.Resize(dim, dim, kUndefined); G_rescaler_.GetOutput(&H_); } @@ -89,24 +92,22 @@ void CoreFmllrEstimator::ComputeB() { int32 dim = L_.NumRows(); bool symmetric = false; L_rescaler_.Init(&L_, symmetric); - VectorBase &L_singular_values = L_rescaler_.InputSingularValues(); - BaseFloat floor = - L_singular_values.Max() * opts_.singular_value_relative_floor; - KALDI_ASSERT(floor > 0.0); - MatrixIndexT num_floored = 0; - L_singular_values.ApplyFloor(floor, &num_floored); - if (num_floored > 0.0) - KALDI_WARN << num_floored << " out of " << dim - << " singular values floored in K matrix."; - VectorBase - &B_singular_values = *L_rescaler_.OutputSingularValues(), - &B_singular_value_derivs = *L_rescaler_.OutputSingularValueDerivs(); - // lambda is the original singular value of l, - // f is where we put f(lambda) - // f_prime is where we put f'(lambda) (the derivative of f w.r.t lambda). - BaseFloat *lambda = L_singular_values.Data(), - *f = B_singular_values.Data(), - *f_prime = B_singular_value_derivs.Data(); + BaseFloat *lambda = L_rescaler_.InputSingularValues(); + { // This block deals with flooring lambda to avoid zero values. + SubVector v(lambda, dim); + BaseFloat floor = v.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + v.ApplyFloor(floor, &num_floored); + if (num_floored > 0.0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in L matrix."; + } + // f is where we put f(lambda). + // f_prime is where we put f'(lambda) (the function-derivative of f w.r.t + // lambda). + BaseFloat *f = L_rescaler_.OutputSingularValues(), + *f_prime = L_rescaler_.OutputSingularValueDerivs(); BaseFloat gamma = gamma_; for (int32 i = 0; i < dim; i++) { @@ -124,6 +125,35 @@ void CoreFmllrEstimator::ComputeA() { A_->AddMatMat(1.0, B_, kNoTrans, H_, kNoTrans, 0.0); } +BaseFloat CoreFmllrEstimator::ComputeObjfChange() { + // we are computing the objective-function improvement from estimating + // A (we'll later compute the improvement from estimating the offset b). + // This is the equation which, from the writeup, is: + // \gamma log |A| + tr(A^T K) - tr(K) + // + 1/2 tr(G) - 1/2 tr(B B^T). + // and we note that log |A| = log |B| + log |G^{-0.5}| = log |B| -0.5 log |G|. + // Here, |.| actually means the absolute value of the determinant. 
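+  // The log-determinant and trace terms involving G and B can be read off the
+  // singular values computed in the forward pass: tr(G) and log|G| are the sum
+  // of G's singular values and of their logs, and tr(B B^T) and log|B| are the
+  // sum of squares and of logs of B's singular values.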
+ + int32 dim = L_.NumRows(); + double logdet_g = 0.0, logdet_b = 0.0, tr_b_bt = 0.0, tr_g = 0.0; + BaseFloat *G_singular_values = G_rescaler_.InputSingularValues(), + *B_singular_values = L_rescaler_.OutputSingularValues(); + for (int32 i = 0; i < dim; i++) { + // we have already ensured that G_singular_values[i] > 0. + logdet_g += Log(G_singular_values[i]); + tr_g += G_singular_values[i]; + logdet_b += Log(B_singular_values[i]); + tr_b_bt += B_singular_values[i] * B_singular_values[i]; + } + + double logdet_A = logdet_b - 0.5 * logdet_g, + tr_at_k = TraceMatMat(*A_, K_, kTrans), + tr_k = K_.Trace(); + + return BaseFloat( + gamma_ * logdet_A + tr_at_k - tr_k + 0.5 * tr_g - 0.5 * tr_b_bt); +} + void CoreFmllrEstimator::BackpropA(const MatrixBase &A_deriv, MatrixBase *B_deriv, MatrixBase *H_deriv) { @@ -161,5 +191,431 @@ void CoreFmllrEstimator::Backward(const MatrixBase &A_deriv, } } + +GaussianEstimator::GaussianEstimator(int32 num_classes, int32 feature_dim): + gamma_(num_classes), + m_(num_classes, feature_dim), + v_(num_classes) { + KALDI_ASSERT(num_classes > 0 && feature_dim > 0); +} + +void GaussianEstimator::AccStats(const MatrixBase &feats, + const Posterior &post) { + KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); + int32 T = feats.NumRows(); + auto iter = post.begin(); + for (int32 t = 0; t < T; t++,++iter) { + SubVector feat(feats, t); + const std::vector > this_post = *iter; + auto iter2 = this_post.begin(), + end2 = this_post.end(); + for (; iter2 != end2; ++iter2) { + int32 i = iter2->first; + BaseFloat p = iter2->second; + gamma_(i) += p; + SubVector this_m(m_, i); + this_m.AddVec(p, feat); + v_(i) += p * VecVec(feat, feat); + } + } +} + +void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { + variance_floor_ = opts.variance_floor; + variance_sharing_weight_ = opts.variance_sharing_weight; + KALDI_ASSERT(variance_floor_ > 0.0 && + variance_sharing_weight_ >= 0.0 && + variance_sharing_weight_ <= 1.0); + KALDI_ASSERT(mu_.NumRows() == 0 && + "You cannot call Estimate() twice."); + int32 num_classes = m_.NumRows(); + + mu_ = m_; + s_.Resize(num_classes, kUndefined); + t_.Resize(num_classes, kUndefined); + for (int32 i = 0; i < num_classes; i++) { + BaseFloat gamma_i = gamma_(i); + if (gamma_i == 0.0) { + // the i'th row of mu will already be zero. + s_(i) = variance_floor_; + } else { + SubVector mu_i(mu_, i); + // We already copied m_ to mu_. + mu_i.Scale(1.0 / gamma_i); + s_(i) = std::max(variance_floor_, + v_(i) / gamma_i - VecVec(mu_i, mu_i)); + } + } + + // apply variance_sharing_weight_. + BaseFloat gamma = gamma_.Sum(), + s = VecVec(gamma_, s_) / gamma, + f = variance_sharing_weight_; + KALDI_ASSERT(gamma != 0.0 && + "You cannot call Estimate() with no stats."); + for (int32 i = 0; i < num_classes; i++) { + t_(i) = (BaseFloat(1.0) - f) * s_(i) + f * s; + } + // Clear the stats, which won't be needed any longer. 
+ m_.Resize(0, 0); + v_.Resize(0); +} + +void GaussianEstimator::SetOutputDerivs( + const MatrixBase &mean_derivs, + const VectorBase &var_derivs) { + KALDI_ASSERT(SameDim(mean_derivs, mu_) && + var_derivs.Dim() == t_.Dim()); + int32 num_classes = mean_derivs.NumRows(), + dim = mean_derivs.NumCols(); + BaseFloat f = variance_sharing_weight_, + variance_floor = variance_floor_, + gamma = gamma_.Sum(); + KALDI_ASSERT(gamma > 0.0); + m_bar_.Resize(num_classes, dim); + v_bar_.Resize(num_classes, kUndefined); + + const VectorBase &t_bar(var_derivs); + const MatrixBase &mu_bar(mean_derivs); + BaseFloat s_bar = f * t_bar.Sum(); + for (int32 i = 0; i < num_classes; i++) { + SubVector m_bar_i(m_bar_, i); + BaseFloat gamma_i = gamma_(i); + if (gamma_i == 0.0 || s_(i) == variance_floor) { + v_bar_(i) = 0.0; + } else { + BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; + v_bar_(i) = s_bar_i / gamma_i; + m_bar_i.AddVec(-2.0 * s_bar_i / gamma_i, mu_.Row(i)); + } + if (gamma_i != 0.0) { + m_bar_i.AddVec(1.0 / gamma_i, mu_bar.Row(i)); + } + } +} + +int32 GaussianEstimator::Dim() const { + // One of these two will be nonempty. + return std::max(m_.NumCols(), mu_.NumCols()); +} + +void GaussianEstimator::Backward(const MatrixBase &feats, + const Posterior &post, + const MatrixBase *feats_deriv) { + // The equation we're implementing is: + // \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) + // See the comment in the header: + // "Notes on implementation of GaussianEstimator". + int32 T = feats.NumRows(); + KALDI_ASSERT(static_cast(post.size() == T) && + SameDim(feats, *feats_deriv)); + auto iter = post.begin(); + for (int32 t = 0; t < T; t++,iter++) { + SubVector feat(feats, t), + feat_deriv(*feats_deriv, t); + const std::vector > this_post = *iter; + auto iter2 = this_post.begin(), + end2 = this_post.end(); + for (; iter2 != end2; ++iter2) { + int32 i = iter2->first; + BaseFloat p = iter2->second; + SubVector m_bar_i(m_bar_, i); + feat_deriv.AddVec(p, m_bar_i); + feat_deriv.AddVec(p * 2.0 * v_bar_(i), feat); + } + } +} + + +FmllrEstimator::FmllrEstimator(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s): + opts_(opts), mu_(mu), s_(s), estimator_(NULL) { + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + opts_.Check(); + + gamma_.Resize(num_classes); + G_.Resize(dim, dim); + K_.Resize(dim, dim); + n_.Resize(dim); +} + +void FmllrEstimator::AccStats(const MatrixBase &feats, + const Posterior &post) { + KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); + int32 num_classes = mu_.NumRows(), + dim = mu_.NumCols(), + T = feats.NumRows(); + // Use temporaries for the stats and later add them to the stats in the class; + // this will reduce roundoff errors if this function is called more than once. + // Also do this every 100 frames or so, again, to reduce roundoff. 
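+  // The statistics accumulated below are, per frame t and class i (writing
+  // gamma_hat_ti = gamma_ti / s_i, and gamma_hat_t for the sum of gamma_hat_ti
+  // over i):
+  //   gamma(i) += gamma_ti;  K += gamma_hat_ti * mu_i * x_t^T;
+  //   G += gamma_hat_t * x_t * x_t^T;  n += gamma_hat_t * x_t.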
+ SpMatrix G(dim); + Matrix K(dim, dim); + Vector gamma(num_classes), + n(dim); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + SubVector x_t(feats, t); + BaseFloat gamma_hat_t = 0.0; + for (; iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second, + gamma_hat_ti = gamma_ti / s_(i); + SubVector mu_i(mu_, i); + gamma(i) += gamma_ti; + gamma_hat_t += gamma_hat_ti; + K.AddVecVec(gamma_hat_ti, mu_i, x_t); + } + G.AddVec2(gamma_hat_t, x_t); + n.AddVec(gamma_hat_t, x_t); + + if (t == T - 1 || (t > 0 && t % 100 == 0)) { + gamma_.AddVec(1.0, gamma); + G_.AddSp(1.0, G); + K_.AddMat(1.0, K); + n_.AddVec(1.0, n); + if (t < T - 1) { + gamma.SetZero(); + G.SetZero(); + K.SetZero(); + n.SetZero(); + } + } + } +} + + +BaseFloat FmllrEstimator::Estimate() { + // If at some point you need to create a version of Estimate() that can be + // called multiple times (e.g. for online applications), it will likely be + // easiest to create a 'const' version of Estimate() that outputs A and b via + // pointers. This one modifies the G_ and K_ quantities, which is what makes + // it tricky to do correctly if called twice. + if (A_.NumRows() != 0) + KALDI_ERR << "You cannot call Estimate() twice."; + int32 dim = mu_.NumCols(); + BaseFloat gamma_tot = gamma_.Sum(); + KALDI_ASSERT(gamma_tot > 0.0 && + "You cannot call Estimate() with zero stats."); + + gamma_hat_ = gamma_; + gamma_hat_.DivElements(s_); + gamma_hat_tot_ = gamma_hat_.Sum(); + n_.Scale(1.0 / gamma_hat_tot_); + + m_.Resize(dim); + m_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kTrans, gamma_hat_, 0.0); + K_.AddVecVec(-gamma_hat_tot_, m_, n_); + G_.AddVecVec(-gamma_hat_tot_, n_, n_); + KALDI_ASSERT(G_.IsSymmetric(0.001)); + // Make sure G_ is perfectly symmetric, which, mathematically, it is. + G_.CopyLowerToUpper(); + A_.Resize(dim, dim, kUndefined); + + BaseFloat gamma_tot_smoothed = gamma_tot; + { + /* + Add smoothing counts to gamma_tot, K_ and G_. This prevents the matrix + from diverging too far from the identity, and ensures more reasonable + transform values when counts are small or dimensions large. We can ignore + this smoothing for computing derivatives, because it happens that it + doesn't affect anything; the quantities gamma_, K_ and G_ are never + consumed in the backprop phase, and the expressions for the derivatives + w.r.t. these quantities don't change from adding an extra term. + */ + gamma_tot_smoothed = gamma_tot + opts_.smoothing_count; + BaseFloat s = opts_.smoothing_between_class_factor; + K_.AddToDiag(opts_.smoothing_count * s); + G_.AddToDiag(opts_.smoothing_count * (1.0 + s)); + } + // Compute A_. + estimator_ = new CoreFmllrEstimator(opts_, gamma_tot_smoothed, G_, K_, &A_); + // A_impr will be the objective-function improvement from estimating A + // (vs. the unit matrix), divided by gamma_tot. Note: the likelihood of the + // 'fake data' we used for the smoothing could only have been made worse by + // estimating this transform, so dividing the total objf-impr by gamma_tot + // (rather than gamma_tot_smoothed, if different) will still be an + // underestimate of the actual improvement. + BaseFloat A_impr = (1.0 / gamma_tot) * estimator_->Forward(); + + // Compute b = m - A n. + b_ = m_; + b_.AddMatVec(-1.0, A_, kNoTrans, n_, 1.0); + + // b_impr is the amount of objective-function improvement from estimating b + // (vs. the default value), divided by the total-count gamma_tot. See section + // 'diagnostics' in the document. 
+ // Note: we aren't doing any smoothing for the offset term. + BaseFloat b_impr = (0.5 * VecVec(b_, b_) * gamma_hat_tot_) / gamma_tot; + return A_impr + b_impr; +} + + + +void FmllrEstimator::AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const { + KALDI_ASSERT(A_.NumRows() != 0 && "You cannot call AdaptFeatures before " + "calling Estimate()."); + KALDI_ASSERT(SameDim(feats, *adapted_feats)); + adapted_feats->CopyRowsFromVec(b_); + adapted_feats->AddMatMat(1.0, feats, kNoTrans, A_, kTrans, 1.0); +} + + +void FmllrEstimator::AdaptFeaturesBackward( + const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + KALDI_ASSERT(SameDim(feats, adapted_feats_deriv) && + SameDim(feats, *feats_deriv)); + // in the writeup: \bar{x}_t <-- A^T \bar{y}_t. + // In this implementation, x_t corresponds to a + // row vector in feats and feats_deriv, so everything is + // transposed to: + // \bar{x}_t^T <--- \bar{y}_t^T A. + feats_deriv->AddMatMat(1.0, adapted_feats_deriv, kNoTrans, + A_, kNoTrans, 1.0); + + // We use temporaries below to possibly reduce roundoff error. + // It's not clear whether this would make a difference-- it depends + // how the BLAS we're using was implemented. + int32 dim = mu_.NumCols(); + // \bar{b} = \sum_t \bar{y}_t + Vector b_bar(dim); + b_bar.AddRowSumMat(1.0, adapted_feats_deriv); + if (b_bar_.Dim() == 0) + b_bar_.Swap(&b_bar); + else + b_bar_.AddVec(1.0, b_bar); + // \bar{A} <-- \sum_t \bar{y}_t x_t^T + Matrix A_bar(dim, dim); + A_bar.AddMatMat(1.0, adapted_feats_deriv, kTrans, feats, kNoTrans, 0.0); + if (A_bar_.NumRows() == 0) + A_bar_.Swap(&A_bar); + else + A_bar_.AddMat(1.0, A_bar); +} + +void FmllrEstimator::EstimateBackward() { + KALDI_ASSERT(G_bar_.NumRows() == 0 && + "You cannot call EstimateBackward() twice."); + KALDI_ASSERT(A_bar_.NumRows() != 0 && + "You must call AdaptFeaturesBackward() before calling " + "EstimateBackward()."); + // do \bar{A} -= \bar{b} n^T + A_bar_.AddVecVec(-1.0, b_bar_, n_); + + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + G_bar_.Resize(dim, dim); + K_bar_.Resize(dim, dim); + estimator_->Backward(A_bar_, &G_bar_, &K_bar_); + delete estimator_; + estimator_ = NULL; + KALDI_ASSERT(G_bar_.IsSymmetric()); + + // \bar{n} = - (\bar{A}^T b + 2\bar{G} n + \bar{K}^T m) + n_bar_.Resize(dim); + n_bar_.AddMatVec(-1.0, A_bar_, kTrans, b_, 0.0); + n_bar_.AddMatVec(-2.0 * gamma_hat_tot_, G_bar_, kNoTrans, n_, 1.0); + n_bar_.AddMatVec(-1.0 * gamma_hat_tot_, K_bar_, kTrans, m_, 1.0); + + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n + m_bar_ = b_bar_; + m_bar_.AddMatVec(-gamma_hat_tot_, K_bar_, kNoTrans, n_, 1.0); + + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n + // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) + gamma_hat_tot_bar_ = -1.0 * VecMatVec(n_, G_bar_, n_) + - VecMatVec(m_, K_bar_, n_) + - (1.0 / gamma_hat_tot_) * (VecVec(n_, n_bar_) + VecVec(m_, m_bar_)); + + // \bar{\hat{\gamma}}_i = \bar{\hat{\gamma}} + \frac{1}{\hat{\gamma}} \mu_i^T \bar{m} + gamma_hat_bar_.Resize(num_classes, kUndefined); + gamma_hat_bar_.Set(gamma_hat_tot_bar_); + gamma_hat_bar_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kNoTrans, m_bar_, 1.0); + + // each row of Kt_bar_mu_ will become \bar{K}^T \mu_i. But the + // expression is transposed below. + Kt_bar_mu_.Resize(num_classes, dim); + Kt_bar_mu_.AddMatMat(1.0, mu_, kNoTrans, K_bar_, kNoTrans, 0.0); + + // \bar{\mu}_i <-- \frac{\hat{\gamma}_i}{\gamma} \bar{m} + // we'll add another term to this later in AccStatsBackward(). 
+ mu_bar_.Resize(num_classes, dim); + mu_bar_.AddVecVec(1.0 / gamma_hat_tot_, gamma_hat_, m_bar_); + + // s_bar_ will be written to in AccStatsBackward(), but we initialize it here. + s_bar_.Resize(num_classes); +} + +void FmllrEstimator::AccStatsBackward( + const MatrixBase &feats, + const Posterior &post, + MatrixBase *feats_deriv) { + KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); + int32 T = feats.NumRows(), num_classes = mu_.NumRows(); + Vector s_bar_temp(num_classes); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + SubVector x_t(feats, t), + x_bar_t(*feats_deriv, t); + BaseFloat gamma_hat_t = 0.0; + for (; iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second, + gamma_hat_ti = gamma_ti / s_(i); + SubVector mu_bar_i(mu_bar_, i); + // \bar{\mu}_i += \hat{\gamma}_{t,i} \bar{K} x_t. + mu_bar_i.AddMatVec(gamma_hat_ti, K_bar_, kNoTrans, x_t, 1.0); + gamma_hat_t += gamma_hat_ti; + SubVector Kt_bar_mu_i(Kt_bar_mu_, i); + // \bar{x}_t += \hat{\gamma}_{t,i} \bar{K}^T \mu_i + x_bar_t.AddVec(gamma_hat_ti, Kt_bar_mu_i); + } + double gamma_hat_bar_t = VecMatVec(x_t, G_bar_, x_t) + + (1.0 / gamma_hat_tot_) * VecVec(x_t, n_bar_); + + // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t + x_bar_t.AddMatVec(2.0 * gamma_hat_t, G_bar_, kNoTrans, x_t, 1.0); + // \bar{x}_t += \frac{\hat{\gamma}_t}{\hat{\gamma}} \bar{n} + x_bar_t.AddVec(gamma_hat_t / gamma_hat_tot_, n_bar_); + + for (iter = post[t].begin(); iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second; + SubVector mu_i(mu_, i); + double gamma_hat_bar_ti = VecMatVec(mu_i, K_bar_, x_t) + + double(gamma_hat_bar_(i)) + double(gamma_hat_bar_t); + // \bar{s}_i += \frac{-1}{s_i^2} \gamma_{t,i} \bar{\hat{\gamma}}_{t,i} + s_bar_temp(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_ti; + } + } + s_bar_.AddVec(1.0, s_bar_temp); +} + +BaseFloat FmllrEstimator::ForwardCombined( + const MatrixBase &feats, + const Posterior &post, + MatrixBase *adapted_feats) { + AccStats(feats, post); + BaseFloat ans = Estimate(); + AdaptFeatures(feats, adapted_feats); + return ans; +} + +void FmllrEstimator::BackwardCombined( + const MatrixBase &feats, + const Posterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); + EstimateBackward(); + AccStatsBackward(feats, post, feats_deriv); +} + +FmllrEstimator::~FmllrEstimator() { + delete estimator_; // in case Estimate() was never called. +} + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/transform/differentiable-fmllr.h b/src/transform/differentiable-fmllr.h index 85c65ffdf02..bfbe5590732 100644 --- a/src/transform/differentiable-fmllr.h +++ b/src/transform/differentiable-fmllr.h @@ -26,6 +26,7 @@ #include "base/kaldi-common.h" #include "util/kaldi-table.h" #include "util/kaldi-holder.h" +#include "hmm/posterior.h" #include "matrix/matrix-functions.h" namespace kaldi { @@ -53,9 +54,7 @@ namespace differentiable_transform { count gamma), and produces A. This has been separated into its own object for purposes of testability. */ - - -struct CoreFmllrEstimatorOptions { +struct FmllrEstimatorOptions { // singular_value_relative_floor is floor that we apply on the // singular values of the inputs G and K, to ensure that no NaN's are @@ -67,11 +66,63 @@ struct CoreFmllrEstimatorOptions { // NaN's). 
BaseFloat singular_value_relative_floor; - CoreFmllrEstimatorOptions(): - singular_value_relative_floor(0.001) { } + + // Floor for (spherical) variances; will be passed to class GaussianEstimator + // when estimating means and variances. + BaseFloat variance_floor; + + // A value in the range [0, 1] which dictates to what extent the variances are + // shared. 0 means not shared at all, 1 means completely shared. Shared + // means the variance is a weighted average of variances, weighted by count of + // that class. This is consumed by class GaussianEstimator. + BaseFloat variance_sharing_weight; + + // A count value of 'fake' counts that we add to the stats G, K and lambda + // during estimation, namely: + // lambda += smoothing_count + // K += smoothing_count * smoothing_between_class_factor * I + // G += smoothing_count * I. + // Interpretable as a number of frames. This prevents things going crazy + // when the amount of data is small. + BaseFloat smoothing_count; + + // A factor that says how large the assumed between-class covariance matrix is + // relative to the within-class covariance matrix. Should be >= 0. A smaller + // value will mean that the smoothing penalizes rotations of the space less; + // with zero, the smoothing only constrains the singular values of A, not + // its direction. + BaseFloat smoothing_between_class_factor; + + FmllrEstimatorOptions(): + singular_value_relative_floor(0.001), + variance_floor(0.0001), + variance_sharing_weight(0.1), + smoothing_count(0.0), + smoothing_between_class_factor(0.25) { } + + void Check() { + KALDI_ASSERT(singular_value_relative_floor > 0.0 && + singular_value_relative_floor < 0.1 && + (variance_floor > 0.0 || variance_sharing_weight > 0.0) && + variance_floor >= 0.0 && + variance_sharing_weight >= 0.0 && + variance_sharing_weight <= 1.0); + } }; +/** + Class CoreFmllrEstimator takes care of the core parts of the fMLLR estimation: + with reference to the notation in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, + it accepts the statistics G and K and the count gamma, and it + computes the fMLLR transform matrix A, and allows you to backprop through + that computation. The reason why we have broken it out as its own class, + is for testability and to limit the complexity of any one class. + + The end-user may want to use class FmllrEstimator instead. + + */ class CoreFmllrEstimator { public: /** @@ -98,10 +149,8 @@ class CoreFmllrEstimator { you call Forward(). May be undefined (e.g., NaN) on entry. You must not change the value of A between calling Forward() and calling Backward(). - - TODO: introduc */ - CoreFmllrEstimator(const CoreFmllrEstimatorOptions &opts, + CoreFmllrEstimator(const FmllrEstimatorOptions &opts, BaseFloat gamma, const MatrixBase &G, const MatrixBase &K, @@ -112,12 +161,11 @@ class CoreFmllrEstimator { 'A' that was passed to the constructor. Returns the objective-function improvement per frame, as compared - with what the objective-function would be with unit A. This equals - the total objective function improvement divided by gamma. - */ + with what the objective-function would be with unit A. This is not + normalized by the number of frames. + */ BaseFloat Forward(); - /** Does the backward pass. Note: it is permissible to call Backward() any number of times, it does not have to be called @@ -169,7 +217,7 @@ class CoreFmllrEstimator { // this estimation. 
BaseFloat ComputeObjfChange(); - CoreFmllrEstimatorOptions opts_; + FmllrEstimatorOptions opts_; BaseFloat gamma_; const MatrixBase &G_; const MatrixBase &K_; @@ -196,6 +244,498 @@ class CoreFmllrEstimator { }; + +/** + Class GaussianEstimator allows you to estimate means and (spherical) variances + from features and posteriors, and to later backprop through that process if + needed. + + It is intended for use during training of the neural net, for use on + individual minibatches: it uses BaseFloat for the accumulators, which might + lead to excessive roundoff if you had a large amount of data. We'll later on + create a separate mechanism for accumulating stats over all the data, given + the full tree. + */ +class GaussianEstimator { + public: + GaussianEstimator(int32 num_classes, int32 feature_dim); + + int32 NumClasses() const { return gamma_.Dim(); } + + int32 Dim() const; + + // Accumulate statistics (you can call this multiple times of needed). + // It does: for each t, and for each pair (i, f) in post[t], accumulate stats + // from feats.Row(t) with class i and weight f. + // May not be called after Estimate() is called. + // + // @param [in] feats The input features, of dimension + // num-frames by feature-dimension + // @param [in] post The posteriors, which is a + // vector > >. + // Its size() must equal feats.NumRows(). + void AccStats(const MatrixBase &feats, + const Posterior &post); + + // You call this once after calling AccStats() one or more times. + // It estimates the model means and variances. + // See the members 'variance_floor' and 'variance_sharing_weight' + // of the options class. + void Estimate(const FmllrEstimatorOptions &opts); + + // Returns true if Estimate() has previously been called, i.e. if + // the means and variances have been computed. + bool IsEstimated(); + + // Returns the means, in a matrix of dimension num_classes by dim. Must not + // be called if ! IsEstimated(). + const MatrixBase &GetMeans() const { return mu_; } + + // Returns the 's' quantities, which are the scalar factors on the (spherical) + // variances. Must not be called if ! IsEstimated(). The + // variance for class i will actually be s_i I, where s_i is an element of + // this vector. + const VectorBase &GetVars() const { return t_; } + + // You call this to set the derivatives df/dmeans and df/dvars-- + // the derivatives of the objective function f w.r.t. those quantities. + // Doing this allows you to backprop through the estimation of the + // means and variances, back to the features. + // This must only be called after previously calling Estimate(). + // This function writes to v_bar_ and m_bar_. + void SetOutputDerivs(const MatrixBase &mean_derivs, + const VectorBase &var_derivs); + + + // This function, which must only be called after SetOutputDerivs() has + // been called, propagates the derivative back to the features. For + // purposes of this backpropagation, the posteriors are treated as + // constants. + // @param [in] feats The features, which must be the same + // as you provided to one of the calls to + // AccStats(). dimension is num-frames by + // feature-dimension. + // @param [in] post The posteriors, as provided to AccStats(). + // Its size() must equal feats.NumRows(). + // @param [in,out] feats_deriv The derivative of the objective + // function w.r.t. the input features. + // This function will *add to* feats_deriv, + // so it must have a well-defined value on + // entry. 
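+  // In the notation of the implementation notes below, the quantity added to
+  // feats_deriv is \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t).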
+ void Backward(const MatrixBase &feats, + const Posterior &post, + const MatrixBase *feats_deriv); + private: + /* + Notes on implementation of GaussianEstimator. + Using Latex notation. + + We are estimating means \mu_i and variance-factors s_i (these + are scales on unit variances). Later we'll apply a kind of + interpolation with the global average variance, controlled + by variance_sharing_weight_, and we'll call the variances that + we finally output t_i. + + We formulate the sufficient statistics as: + the counts \gamma_i, the mean stats m_i and the (scalar) + variance stats v_i: + + \gamma_i = \sum_t \gamma_{t,i} + m_i = \sum_t \gamma_{t,i} x_t + v_i = \sum_t \gamma_{t,i} x_t^T x_t + The estimation procedure is: + \mu_i = \frac{m_i}{\gamma_i}, or 0 if \gamma_i is 0. + s_i = variance_floor if \gamma_i = 0, else: + max(variance_floor, v_i/\gamma_i - \mu_i^T \mu_i) + and another form more convenient for backprop: + = variance_floor if \gamma_i = 0, else: + max(variance_floor, v_i/\gamma_i - m_i^T m_i / \gamma_i^2) + + + We write \bar{foo} for a derivative of the objective function w.r.t. foo. + We are provided by the user with with \bar{\mu}_i and \bar{s}_i, when they + call SetOutputDerivs(). We first compute + \bar{m}_i and \bar{v}_i (the derivs w.r.t. the raw statistics) as follows: + \bar{m}_i = 0 if \gamma_i is 0, otherwise: + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i m_i}{\gamma_i^2} + if s_i > variance_floor, else 0) + = or 0 if \gamma_i is 0, otherwise: + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i \mu_i}{\gamma_i} + if s_i > variance_floor, else 0) + \bar{v}_i = 0 if \gamma_i is 0 or s_i equals variance_floor, otherwise: + \frac{\bar{s}_i}{\gamma_i} + \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) + + + If 'variance_sharing_weight' != 0.0, then we need to modify the above. + Let the variance-floored version of the variance be t_i. + Write variance_sharing_weight as f (with 0 <= f <= 1), and let + \gamma = \sum_i \gamma_i. + Define the weighted-average variance: + s = \sum_i \frac{\gamma_i}{\gamma} s_i + and the partly-shared output variance is: + t_i = (1-f) s_i + f s. + For the backprop: If the user supplies derivatives \bar{t}_i, then: + \bar{s} = f \sum_i \bar{t}_i + \bar{s}_i = (1-f) \bar{t}_i + \frac{\gamma_i}{\gamma} \bar{s}. + */ + + + // gamma_, of dimension num_classes, contains the raw count statistics \gamma_i. + // It's added to when you call AccStats(). + Vector gamma_; + // m_ is the raw mean statistics (feature times soft-count); it's of dimension + // num_classes by feat_dim. + Matrix m_; + // v_ is the raw variance statistics (inner-product-of-feature times soft-count); + // it's of dimension num_classes. + Vector v_; + + // variance_floor_ and variance_sharing_weight_ are copies of the corresponding + // variables in class FmllrEstimatorOptions; they are set when Estimate() is called. + BaseFloat variance_floor_; + BaseFloat variance_sharing_weight_; + + // mu_ is the estimated means, which is set up when you call Estimate(). + Matrix mu_; + // s_ is the variances, after flooring by variance_floor_ but before + // applying variance_sharing_weight_. + Vector s_; + // t_ is the smoothed or maybe totally averaged-over-all-classes variances, + // derived from t as specified by variance_sharing_weight_. + Vector t_; + + // v_bar_, of dimension num_classes, contains \bar{v}_i. It's only set up + // after you call SetOutputDerivs(). + Vector v_bar_; + // m_bar_, of dimension num_classes by feature_dim, contains \bar{m}_i. 
+ // It's only set up after you call SetOutputDerivs(). + Matrix m_bar_; + + +}; + + + +/** + Class FmllrEstimator encapsulates the whole of the fMLLR computation- for + a single speaker. See + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf + for a description of what is being implemented here. + + This class is suitable for use in training, where you want to backprop + through the computation; and also in test time (but not for the online + scenario; we may later rewrite a version that's optimized for that, or modify + this class to handle that). + + This class would normally be used as follows: + - Construct an instance of the class (probably for a particular speaker on + a particular minibatch). + + Then, either: + + - Call AccStats() one or more times. + - Call Estimate(). + - Call AdaptFeatures() one or more times to get the output features. + - Do something with those output features that (if you are training) + gives you some kind of objective-function derivative w.r.t. those + features. Then if you are training, do what's below: + - Call AdaptFeaturesBackward() one or more times to get part of the + derivative w.r.t. the input features. Note: the calls to AdaptFeatures() + and AdaptFeaturesBackward() may be interleaved, since the call to + AdaptFeatures() does not modify the object. + - Call EstimateBackward() + - Call AccStatsBackward() one or more times to get the part of the + derivative w.r.t. the input features that comes from the effect + on the transform itself. + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances (these will be passed to class GaussianEstimator, + and eventually to the features). + + Or: if there is only one training sequence, you can use the + simplified interface: after calling the constructor, + + - call ForwardCombined() + - call BackwardCombined() + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances, with the help of class GaussianEstimator. +*/ +class FmllrEstimator { + public: + /** + Constructor. + @param [in] opts Options class. This class makes a copy. + @param [in] mu Class means, probably as output by class + GaussianEstimator. This class maintains a + reference to this object, so you should ensure + that it exists for the lifetime of this object. + @param [in] s Scaling factors for spherical class + variances, probably as output by class + GaussianEstimator. As with mu, we store + a reference to it, so don't destroy or + change it as long as this class instance exists. + */ + FmllrEstimator(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s); + + + /** + Accumulate statistics to estimate the fMLLR transform. + @param [in] feats The feature matrix. A row of it would be called + x_t in the writeup in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. + @param [in] post The posteriors. post.size() must equal feats.NumRows(). + Each element of post is a list of pairs (i, p) where + i is the class label and p is the soft-count. + */ + void AccStats(const MatrixBase &feats, + const Posterior &post); + + + /** + Estimate the fMLLR transform parameters A and b. Returns the + objective-function improvement compared with A = I, b = 0, divided by the + total count as returned by TotalCount(). + */ + BaseFloat Estimate(); + + /// Returns the total count of the posteriors accumulated so far. 
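+  /// (This is \gamma = \sum_i \gamma_i, i.e. the sum of the soft-counts over
+  /// all frames and classes.)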
+ BaseFloat TotalCount(); + + /// Return the linear parameter matrix. Adapted features are + /// y_t = A x_t + b. You won't necessarily need to + /// call this, you can use ComputeAdaptedFeatures() intead. + const MatrixBase &GetLinearParams() { return A_; } + + /// Return the bias term b. + const VectorBase &GetBiasParams() { return b_; } + + /// Computes the adapted features y_t = A x_t + b. + /// feats (x) and adapted_feats (y) must have the same dimension. Must + /// only be called after Estimate() has been called. + /// 'adapted_feats' may contain NaN's on entry. + void AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const; + + /** + This is the backward pass corresponding to the function AdaptFeatures(). + It propagates back only part of the derivative-- not including the part + that's due to how the transform changes when the features change. It + also accumulates within this class instance the derivative w.r.t. + A and b. You are expected to later call EstimateBackward() and + AccStatsBackward() to propagate the part of the derivative that comes from + the effect on the transform, back to the input features. + + See also AccStatsBackward(). + @param [in] feats The features (x) that were the original input to + AdaptFeatures(). + @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) + that was the result of calling AdaptFeatures(). Must + have the same size as feat. + @param [in,out] feats_deriv The derivative w.r.t. 'feats'; this function + *adds* to it. + */ + void AdaptFeaturesBackward(const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + /** + This is the backward pass corresponding to Estimate(). You call this after + calling AdaptFeaturesBackward() one or more times (which will accumulate + the derivative w.r.t. A and B). It backpropagates through the core + estimation procedure of fMLLR, in preparation for you calling + AccStatsBackward(). + */ + void EstimateBackward(); + + + // Returns the derivative w.r.t. the class means 'mu' that were supplied to the + // constructor. Must not be called until EstimateBackward() has been called. + const MatrixBase &GetMeanDeriv() const { return mu_bar_; } + // Returns the derivative w.r.t. the variance factors 's' that were supplied + // to the constructor. Must not be called until EstimateBackward() has been + // called. + const VectorBase &GetVarDeriv() const { return s_bar_; } + + /** + This is the backward pass corresponding to AccStats(). You call this after + calling EstimateBackward(). It computes the part of the derivative w.r.t. + 'feats' that comes from the effect on the transform parameters. You will + normally have previously called AdaptFeaturesBackward() on these same + features. + @param [in] feats The features as given to AccStats() + @param [in] post The posteriors as given to AccStats() + @param [in,out] feats_deriv This function *adds* to feats_deriv. + It adds the terms in \bar{x}_t that arise from + the derivative w.r.t. the transform parameters. The + "direct" term \bar{x}_t = A^T \bar{y}_t will have + previously been added by AdaptFeaturesBackward(). + */ + void AccStatsBackward(const MatrixBase &feats, + const Posterior &post, + MatrixBase *feats_deriv); + + /** + Combines AccStats(), Estimate() and AdaptFeatures() in one call; + for use when there is only one sequence. Returns the objective-function + improvement (per soft-count). 
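+     The return value is the same as that of Estimate().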
+ @param [in] feats The features we're estimating the fMLLR parameters from + @param [in] post The posteriors corresponding to 'feats + @param [out] adapted_feats A matrix the same size as 'feats', to which + the adapted features will be written. May contain + NaNs at entry. + */ + BaseFloat ForwardCombined(const MatrixBase &feats, + const Posterior &post, + MatrixBase *adapted_feats); + /** + Combines AdaptFeaturesBackward(), EstimateBackward(), and + AccStatsBackward(); for use when there is only one sequence. + Note: 'feats_deriv' is *added* to so must be defined at entry. + */ + void BackwardCombined(const MatrixBase &feats, + const Posterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + ~FmllrEstimator(); + private: + + + ///////////// Fixed quantities passed in in the constructor /////////// + + // The options. + FmllrEstimatorOptions opts_; + // The means. A reference to an object owned elsewhere. + const MatrixBase &mu_; + // The variance factors (the variances are s_(i) times I). A reference to an + // object owned elsewhere. + const VectorBase &s_; + + ///////////// Quantities that are accumulated in AccStats() /////////// + + // Counts per class; dimension is num_classes. Added to when AccStats() is + // called. gamma_(i) corresponds to \gamma_i in the write up; it's + // \gamma_i = \sum_t gamma_{t,i} + Vector gamma_; + + // This contains + // G = (\sum_t \hat{\gamma}_t x_t x_t^T ) - \hat{\gamma} n n^T. + // Before Estimate() is called, it won't contain the 2nd term, only the first. + Matrix G_; + + // This contains + // K = (\sum_{t,i} \hat{\gamma}_{t,i} \mu_i x_t^T) - \hat{\gamma} m n^T + // Before Estimate() is called, it won't contain the 2nd term, only the first. + Matrix K_; + + // After Estimate() is called, this will be the quantity: + // n = \frac{1}{\hat{\gamma}} \sum_t \hat{\gamma}_t x_t. + // Before Estimate() is called, this won't include the factor + // 1/\hat{\gamma}, so it will be just \sum_t \hat{\gamma}_t x_t. + Vector n_; + + + /////////// Quantities that are computed when Estimate() is called //////// + + // gamma_hat_ is the same as gamma_, but divided by the class-specific variance + // factor s_i. In the writeup it's \hat{\gamma}_i. + Vector gamma_hat_; + // gamma_hat_tot_ is gamma_hat_.Sum(). In the writeup it's \hat{\gamma}. + BaseFloat gamma_hat_tot_; + + + // The weighted-average of the means: + // m = \frac{1}{\hat{\gamma}} \sum_i \hat{\gamma}_i \mu_i + Vector m_; + + // The parameter matrix + Matrix A_; + // The offset term + Vector b_; + // The object we use to estimate A and b, and to backprop through that + // process. + CoreFmllrEstimator *estimator_; + + ////////// Quantities that are accumulated in AdaptFeaturesBackward() //////// + + // The derivative w.r.t. A. This is set when AdaptFeaturesBackward() is called, + // to: + // \bar{A} = \sum_t \bar{y}_t x_t^T + // and then when EstimateBackward() is called, we add the term from the estimation + // of b, which is: + // \bar{A} -= \bar{b} n^T + Matrix A_bar_; + + // The derivative w.r.t. b. This is set when AdaptFeaturesBackward() is called, + // to: \bar{b} = \sum_t \bar{y}_t. + Vector b_bar_; + + ////////// Quantities that are computed in EstimateBackward() //////// + + // The derivative w.r.t. G; computed by 'estimator_' + Matrix G_bar_; + // The derivative w.r.t. K; computed by 'estimator_'. + Matrix K_bar_; + + // The derivative w.r.t. 
n: + // \bar{n} = -\bar{A}^T b - 2\hat{\gamma} \bar{G} n - \hat{\gamma} \bar{K}^T m + Vector n_bar_; + + // The derivative w.r.t. m: + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n + Vector m_bar_; + + // gamma_hat_tot_bar_ is \bar{\hat{\gamma}} in the writeup; + // it's: + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n + // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) + BaseFloat gamma_hat_tot_bar_; + // gamma_hat_bar_ contains the quantities that we write as + // \bar{\hat{\gamma}}_i in the writeup. It's: + // \bar{\hat{\gamma}}_i = \bar{\hat{\gamma}} + \frac{1}{\hat{\gamma}} \mu_i^T \bar{m} + Vector gamma_hat_bar_; + + // Kt_bar_mu_ has the same dimension as mu_; the i'th row contains the + // quantity \bar{K}^T \mu_i. This is cached here to avoid a matrix multiplication + // during the backward pass. + Matrix Kt_bar_mu_; + + + //////////// Quantities that are written to in AccStatsBackward() /////////// + + // The i'th row contains the derivative w.r.t mu_i. + // In Estimate(), this is set to: + // \bar{\mu}_i = \frac{\hat{\gamma}_i}{\hat{\gamma}} \bar{m} + // and in AccStatsBackward(), we do: + // \bar{\mu}_i += \sum_t \hat{\gamma}_{t,i} \bar{K} x_t. + Matrix mu_bar_; + + /// s_bar_(i) contains the derivative w.r.t the variance factor s_i, + /// which we write in the writeup as \bar{s}_i. + /// It equals: \bar{s}_i = \frac{-1}{s_i^2} \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_{t,i} + /// \bar{\hat{\gamma}}_{t,i}, computed as a temporary, equals: + /// \bar{hat{\gamma}}_{t,i} = \mu_i^T \bar{K} x_t + \bar{\hat{\gamma}}_i + \bar{\hat{\gamma}}_t + /// where + /// \bar{\hat{\gamma}}_t = x_t^T \bar{G} x_t + \frac{1}{\hat{\gamma}} x_t^T \bar{n} + Vector s_bar_; + + + // There is another quantity that's updated by AccStatsBackward(), which is + // \bar{x}_t, the derivative w.r.t. x_t. AccStatsBackward() does not include + // the term \bar{x}_t = A^T \bar{y}_t. But it does include the rest of the + // terms, doing: + // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t + // + \sum_i \hat{\gamma}_{t,i} \bar{K}^T \mu_i + // + \frac{\hat{\gamma}_t}{\hat{\gamma}} \bar{n} + // There is no variable for this; it's a temporary. + +}; + + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/transform/differentiable-transform.h b/src/transform/differentiable-transform.h new file mode 100644 index 00000000000..eda3b64db3f --- /dev/null +++ b/src/transform/differentiable-transform.h @@ -0,0 +1,688 @@ +// transform/differentiable-transform.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
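+
+// This header declares the DifferentiableTransform interface -- a
+// speaker-adaptive feature transform, intended to sit in the bottleneck of a
+// neural net, through whose estimation you can backprop -- together with
+// several implementations: NoOpTransform, SequenceTransform, AppendTransform,
+// SimpleMeanTransform and FmllrTransform.
+//
+// A rough sketch of the intended training-time usage (the variable names used
+// here are illustrative only):
+//
+//   DifferentiableTransform *transform = ...;  // e.g. obtained via ReadNew().
+//   CuMatrix<BaseFloat> output(input.NumRows(), transform->Dim());
+//   MinibatchInfoItf *info = transform->TrainingForward(
+//       input, num_chunks, num_spk, posteriors, &output);
+//   ... compute the objective function and output_deriv from 'output' ...
+//   transform->TrainingBackward(input, output_deriv, num_chunks, num_spk,
+//                               posteriors, *info, &input_deriv);
+//   delete info;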
+ + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ + +#include + +#include "base/kaldi-common.h" +#include "util/kaldi-table.h" +#include "util/kaldi-holder.h" + +namespace kaldi { + + +namespace differentiable_transform { + +class MinibatchInfoItf { + public: + + virtual ~MinibatchInfoItf() { } +}; + + +class SpeakerStatsItf { + + virtual ~SpeakerStatsItf() { } +}; + + + +/** + This class is for speaker-dependent feature-space transformations -- + principally various varieties of fMLLR, including mean-only, diagonal and + block-diagonal versions -- which are intended for placement in the bottleneck + of a neural net. So code-wise, we'd have: bottom neural net, then transform, + then top neural net. The transform is designed to be differentiable, i.e. it + can be used during training to propagate derivatives from the top neural net + down to the bottom neural net. The reason this is non-trivial (i.e. why it's + not just a matrix multiplication) is that the value of the transform itself + depends on the features, and also on the speaker-independent statistics for + each class (i.e. the mean and variance), which also depends on the features. + You can view this as an extension of things like BatchNorm, except the + interface is more complicated because there is a dependence on the per-frame + class labels. + + The class labels we'll use here will probably be derived from some kind of + minimal tree, with hundreds instead of thousands of states. Part of the + reason for using a smaller number of states is that, to make the thing + properly differentiable during training, we need to use a small enough number + of states that we can obtain a reasonable estimate for the mean and variance + of a Gaussian for each one in training time. Anyway, see + http://isl.anthropomatik.kit.edu/pdf/Nguyen2017.pdf, it's generally better + for this kind of thing to use "simple target models" for adaptation. + + Note: for training utterances we'll generally get the class labels used for + adatpation in a supervised manner, either by aligning a previous system like + a GMM system, or from the (soft) posteriors of the the numerator graphs. In + test time, we'll usually be getting these class labels from some kind of + unsupervised process. + + Because we tend to train neural nets on fairly small fixed-size chunks + (e.g. 1.5 seconds), and transforms like fMLLR don't tend to work very well + until you have about 5 seconds of data, we will usually be arranging those + chunks into groups where all members of the group comes from the same + speaker. + */ +class DifferentiableTransform { + public: + + /// Return the dimension of the input and output features. + virtual int32 Dim() const = 0; + + + /// Return the number of classes in the model used for adaptation. These + /// will probably correspond to the leaves of a small tree, so they would + /// be pdf-ids. This model only keeps track of the number of classes, + /// it does not contain any information about what they mean. The + /// integers in the objects of type Posterior provided to this class + /// are expected to contain numbers from 0 to NumClasses() - 1. + int32 NumClasses() const { return num_classes_; } + + + /// This can be used to change the number of classes. It would normally be + /// used, if at all, after the model is trained and prior to calling + /// Accumulate(), in case you want to use a more detailed model (e.g. 
the + /// normal-size tree instead of the small one that we use during training). + /// Child classes may want to override this, in case they need to do + /// something more than just set this variable. + virtual void SetNumClasses(int32 num_classes) { num_classes_ = num_classes; } + + /** + This is the function you call in training time, for the forward + pass; it adapts the features. By "training time" here, we + assume you are training the 'bottom' neural net, that produces + the features in 'input'; if you were not training it, it would + be the same as test time as far as this function is concerned. + + @param [in] input The original, un-adapted features; these + will typically be output by a neural net, the 'bottom' net in our + terminology. This will correspond to a whole minibatch, + consisting of multiple speakers and multiple sequences (chunks) + per speaker. Caution: the order of both the input and + output features, and the posteriors, does not consist of blocks, + one per sequence, but rather blocks, one per time frame, so the + sequences are intercalated. + @param [in] num_chunks The number of individual sequences + (e.g., chunks of speech) represented in 'input'. + input.NumRows() will equal num_sequences times the number + of time frames. + @param [in] num_spk The number of speakers. Must be greater than one, and + must divide num_chunks. The number of chunks per speaker + (num_chunks / num_spk) must be the same for all speakers, and the + chunks for a speaker must be consecutive. + @param [in] posteriors (note: this is a vector of vector of + pair). This provides, in 'soft-count' + form, the class supervision information that is used for the + adaptation. posteriors.size() will be equal to input.NumRows(), + and the ordering of its elements is the same as the ordering + of the rows of input, i.e. the sequences are intercalated. + There is no assumption that the posteriors sum to one; + this allows you to do things like silence weighting. + @param [out] output The adapted output. This matrix should have the + same dimensions as 'input'. + @return This function returns either NULL or an object of type + DifferentiableTransformItf*, which is expected to be given + to the function TrainingBackward(). It will store + any information that will be needed in the backprop phase. + */ + virtual MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const = 0; + + + /** + This does the backpropagation, during the training pass. + + @param [in] input The original input (pre-transform) features that + were given to TrainingForward(). + @param [in] output_deriv The derivative of the objective function + (that we are backpropagating) w.r.t. the output. + @param [in] num_chunks,num_spk,posteriors + See TrainingForward() for information + about these arguments; they should be the same + values. + @param [in] minibatch_info The object returned by the corresponding + call to TrainingForward(). The caller + will likely want to delete that object after + calling this function + @param [in,out] input_deriv The derivative at the input, i.e. + dF/d(input), where F is the function we are + evaluating. Must have the same dimension as + 'input'. The derivative is *added* to here. 
+ This is useful because generally we will also + be training (perhaps with less weight) on + the unadapted features, in order to prevent them + from deviating too far from the adapted ones + and to allow the same model to be used for the + first pass. + */ + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const = 0; + + + /** + Returns the number of times you have to (call Accumulate() on a subset + of data, then call Estimate()) + */ + virtual int32 NumFinalIterations() = 0; + + /** + This will typically be called sequentially, minibatch by minibatch, + for a subset of training data, after training the neural nets, + followed by a call to Estimate(). Accumulate() stores statistics + that are used by Estimate(). This process is analogous to + computing the final stats in BatchNorm, in preparation for testing. + In practice it will be doing things like computing per-class means + and variances. + + @param [in] final_iter An iteration number in the range + [0, NumFinalIterations()]. In many cases there will + be only one iteration so this will just be zero. + + The input parameters are the same as the same-named parameters to + TrainingForward(); please refer to the documentation there. + */ + virtual void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) = 0; + + // To be called after repeated alls to Accumulate(), does any estimation that + // is required in training time (normally per-speaker means and possibly + // variances. + virtual void Estimate(int32 final_iter) = 0; + + // Returns an object representing sufficient statistics for estimating a + // speaker-dependent transform. This object will initially have zero + // counts in its statistics. It will represent the stats for a single + // speaker. + virtual SpeakerStatsItf *GetEmptySpeakerStats() = 0; + + + // Accumulate statistics for a segment of test data, storing them in the + // object 'speaker_stats'. There is no assumption that the soft-counts in + // 'posteriors' are positive; this allows you to change your mind about the + // traceback, in test-time, by subtracting the stats that you no longer want + // to use. + virtual void TestingAccumulate( + const MatrixBase &input, + const Posterior &posteriors, + SpeakerStatsItf *speaker_stats) const = 0; + + // Applies the transformation implied by the statistics in 'speaker_stats' to + // 'input', storing in the result in 'output'. It will do any estimation + // procedure that is required first, if applicable. + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const = 0; + + + // Read transform from stream (works out its type). Dies on error. + static DifferentiableTransform* ReadNew(std::istream &is, bool binary); + + // Copies transform (deep copy). + virtual DifferentiableTransform* Copy() const = 0; + + // Returns a new transform of the given type e.g. "MeanNormalize", + // or NULL if no such component type exists. + static DifferentiableTransform *NewTransformOfType(const std::string &type); + + // Write transform to stream + virtual void Write(std::ostream &os, bool binary) const = 0; + + // Reads transform from stream (normally you would previously have created + // the transform object of the correct type using ReadNew(). 
+ virtual void Read(std::istream &is, bool binary) = 0; + + protected: + int32 num_classes_; + + +}; + + +/** + This is a version of the transform class that does nothing. It's potentially + useful for situations where you want to apply speaker normalization to some + dimensions of the feature vector but not to others. + */ +class NoOpTransform: public DifferentiableTransform { + public: + + int32 Dim() const override { return dim_; } + int32 NumClasses() const override { return num_classes_; } + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override { + output->CopyFromMat(input); + return NULL; + } + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const override { + input_deriv->AddMat(1.0, output_deriv); + } + + virtual int32 NumFinalIterations() { return 0; } + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override { } + + + + SpeakerStatsItf *GetEmptySpeakerStats() override { return NULL; } + + void TestingAccumulate( + const MatrixBase &input, + const Posterior &posteriors, + SpeakerStatsItf *speaker_stats) const override { } + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) override { + output->CopyFromMat(input); + } + + void Estimate(int32 final_iter) override { } + + NoOpTransform(const NoOpTransform &other): + dim_(other.dim_), num_classes_(other.num_classes_) { } + + DifferentiableTransform* Copy() const override { + return new NoOpTransform(*this); + } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + private: + int32 dim_; + int32 num_classes_; +}; + + +/** + This is a version of the transform class that does a sequence of other + transforms, specified by other instances of the DifferentiableTransform + interface. 
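+   For example, one could use this to apply a mean-offset transform (e.g.
+   SimpleMeanTransform) followed by an FmllrTransform, with the output of one
+   becoming the input of the next.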
+ */ +class SequenceTransform: public DifferentiableTransform { + public: + + int32 Dim() const override; + int32 SetNumClasses() const override; + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const override; + + virtual int32 NumFinalIterations(); + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + SpeakerStatsItf *GetEmptySpeakerStats() override; + + void TestingAccumulate( + const MatrixBase &input, + const Posterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) override; + + void Estimate(int32 final_iter) override; + + SequenceTransform(const SequenceTransform &other); + + DifferentiableTransform* Copy() const override { + return new SequenceTransform(*this); + } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + private: + std::vector transforms_; +}; + + +/** + This is a version of the transform class that consists of a number of + other transforms, appended dimension-wise-- e.g. this could be used to + implement block-diagonal fMLLR, or a structure where some dimensions are + adapted and some are not. + */ +class AppendTransform: public DifferentiableTransform { + public: + + int32 Dim() const override; + int32 SetNumClasses() const override; + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const override; + + virtual int32 NumFinalIterations(); + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) override; + + void Estimate(int32 final_iter) override; + + AppendTransform(const AppendTransform &other); + + DifferentiableTransform* Copy() const override { + return new AppendTransform(*this); + } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + private: + std::vector transforms_; +}; + + + +/** + This is a version of the transform class that appends over sub-ranges + of dimensions, so that, for instance, you can implement a block-diagonal + transform or a setup where some dimensions are transformed and some are + not. 
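+   For example, one could use this to apply an FmllrTransform to the first
+   block of dimensions and a NoOpTransform to the rest.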
+*/ +class AppendTransform: public DifferentiableTransform { + int32 Dim() const override; + int32 NumClasses() const override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override { + output->CopyFromMat(input); + return NULL; + } + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const override; + + void Accumulate( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate() override { } + + AppendTransform(const AppendTransform &other); + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + private: + std::vector transforms_; +}; + + +/** + This version of the transform class does a mean normalization: adding an + offset to its input so that the difference (per speaker) of the transformed + class means from the speaker-independent class means is minimized. + This is like a mean-only fMLLR with fixed (say, unit) covariance model. + */ +class SimpleMeanTransform: public DifferentiableTransform { + public: + int32 Dim() const override; + int32 NumClasses() const override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override { + output->CopyFromMat(input); + return NULL; + } + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const override; + + void Accumulate( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) override; + + + void Estimate() override { } + + AppendTransform(const AppendTransform &other); + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + private: + + // OK: how to compute stats + class MinibatchInfo: public MinibatchInfoItf { + + // Stores the total weights, per frame, that correspond to the Posteriors + // supplied to TrainingForward(). + CuVector frame_weights; + + // The total of frame_weights. + BaseFloat total_weight; + }; + + // dim_ is the feature dimension + int32 dim_; + + // The class-dependent means. Dimension is num_classes_ by dim_. + // Note: these will not be set up during training, they will only + // be set up after calling Accumulate() and Estimate(), which happens + // in test time. + CuMatrix means_; + + // mean_stats_ and count_ are used in Accumulate() to accumulate + // statistics to adapt the mean. + CuMatrix mean_stats_; + double count_; + +}; + + +/** + Notes on the math behind differentiable fMLLR transform. 
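+   For the math, see
+   http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf and the
+   documentation of classes GaussianEstimator, CoreFmllrEstimator and
+   FmllrEstimator in differentiable-fmllr.h, which implement the estimation
+   and the corresponding backprop.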
+ + */ + +class FmllrTransform: public DifferentiableTransform { + public: + int32 Dim() const override; + int32 NumClasses() const override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + const MinibatchInfoItf &minibatch_info, + CuMatrixBase *input_deriv) const override; + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + SpeakerStatsItf *GetEmptySpeakerStats() override; + + void TestingAccumulate( + const MatrixBase &input, + const Posterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) override; + + void Estimate(int32 final_iter) override { } + + FmllrTransform(const FmllrTransform &other); + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + private: + + // OK: how to compute stats + class MinibatchInfo: public MinibatchInfoItf { + + // Stores the total weights, per frame, that correspond to the Posteriors + // supplied to TrainingForward(). frame_weights.Dim() equals + // input.NumRows(). + CuVector frame_weights; + + // The total of frame_weights per speaker. + CuVector frame_weights; + + BaseFloat total_weight; + }; + + class SpeakerStats: public SpeakerStatsItf { + + }; + + // dim_ is the feature dimension + int32 dim_; + + // The class-dependent means. Dimension is num_classes_ by dim_. + // Note: these will not be set up during training, they will only + // be set up after calling Accumulate() and Estimate(), which happens + // in test time. + CuMatrix means_; + + // mean_stats_ and count_ are used in Accumulate() to accumulate + // statistics to adapt the mean. + CuMatrix mean_stats_; + double count_; + +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ From 49fb31320c2d936fc0965076711676f36c916bc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 28 Nov 2018 20:19:22 -0500 Subject: [PATCH 24/87] [src] Commit working version before making some changes. --- src/transform/differentiable-fmllr-test.cc | 48 ++++++++-- src/transform/differentiable-fmllr.cc | 40 ++++++-- src/transform/differentiable-fmllr.h | 101 ++++++++++++++++++++- 3 files changed, 168 insertions(+), 21 deletions(-) diff --git a/src/transform/differentiable-fmllr-test.cc b/src/transform/differentiable-fmllr-test.cc index 9d4ab6c9cd5..e2dfdad9318 100644 --- a/src/transform/differentiable-fmllr-test.cc +++ b/src/transform/differentiable-fmllr-test.cc @@ -42,6 +42,7 @@ void TestCoreFmllrEstimatorKDeriv( Matrix A_deriv(dim, dim); // A_deriv defines the objective function: a random linear function in A. A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. Matrix G_deriv(dim, dim), K_deriv(dim, dim); @@ -90,6 +91,7 @@ void TestCoreFmllrEstimatorGDeriv( Matrix A_deriv(dim, dim); // A_deriv defines the objective function: a random linear function in A. A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. 
Matrix G_deriv(dim, dim), K_deriv(dim, dim); @@ -148,6 +150,13 @@ void UnitTestCoreFmllrEstimatorSimple() { } } +static void InitRandNonsingular(MatrixBase *M) { + do { + M->SetRandn(); + } while (M->Cond() > 50.0); +} + + void UnitTestCoreFmllrEstimatorGeneral() { int32 dim = RandInt(10, 20); BaseFloat gamma = RandInt(5, 10); @@ -156,12 +165,12 @@ void UnitTestCoreFmllrEstimatorGeneral() { { // make sure G is symmetric and +ve definite. - Matrix A(dim, dim + 5); + Matrix A(dim, dim + 10); A.SetRandn(); G.AddMatMat(gamma, A, kNoTrans, A, kTrans, 0.0); } - K.SetRandn(); + InitRandNonsingular(&K); K.Scale(gamma); FmllrEstimatorOptions opts; CoreFmllrEstimator estimator(opts, gamma, G, K, &A); @@ -253,6 +262,7 @@ void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, Matrix adapted_feats_deriv(T, dim), feats_deriv(T, dim); adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); @@ -265,12 +275,16 @@ void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, // measure the accuracy of the deriv in 4 random directions. int32 n = 4; - BaseFloat epsilon = 1.0e-03; + BaseFloat epsilon = 1.0e-04; Vector expected_changes(n), actual_changes(n); for (int32 i = 0; i < n; i++) { Matrix new_mu(num_classes, dim, kUndefined), new_adapted_feats(T, dim, kUndefined); new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } new_mu.Scale(epsilon); expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); new_mu.AddMat(1.0, mu); @@ -312,6 +326,9 @@ void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, Matrix adapted_feats_deriv(T, dim), feats_deriv(T, dim); adapted_feats_deriv.SetRandn(); + // Adding a systematic component to the derivative makes the test easier + // to pass, as the derivs are less random. + adapted_feats_deriv.AddMat(0.1, feats); f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); @@ -324,7 +341,7 @@ void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, // measure the accuracy of the deriv in 10 random directions int32 n = 10; - BaseFloat epsilon = 0.1; + BaseFloat epsilon = 0.01; Vector expected_changes(n), actual_changes(n); for (int32 i = 0; i < n; i++) { Vector new_s(num_classes, kUndefined); @@ -371,6 +388,7 @@ void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, Matrix adapted_feats_deriv(T, dim), feats_deriv(T, dim); adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); @@ -407,14 +425,17 @@ void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, void UnitTestGaussianAndFmllrEstimator() { - int32 num_classes = RandInt(50, 100), - dim = RandInt(5, 10), - num_frames = RandInt(40 * num_classes, 100 * num_classes); + // It's important that the number of classes be greater than the dimension, or + // we would get a low-rank K. + int32 num_classes = RandInt(30, 40), + dim = RandInt(10, 20), + num_frames = RandInt(20 * num_classes, 40 * num_classes); GaussianEstimator g(num_classes, dim); Matrix feats(num_frames, dim); feats.SetRandn(); + feats.Add(0.1); // Nonzero offset tests certain aspects of the code better. 
Posterior post(num_frames); for (int32 t = 0; t < num_frames; t++) { int32 n = RandInt(0, 2); @@ -426,15 +447,22 @@ void UnitTestGaussianAndFmllrEstimator() { } g.AccStats(feats, post); FmllrEstimatorOptions opts; - opts.variance_sharing_weight = 0.25 * RandInt(0, 4); // will try other values later. + // avoid setting variance_sharing_weight to 1.0; it's hard for the tests to + // succeed then, and there are valid reasons for that + opts.variance_sharing_weight = 0.25 * RandInt(0, 2); g.Estimate(opts); KALDI_LOG << "Means are: " << g.GetMeans() << ", vars are: " << g.GetVars(); + TestGaussianEstimatorDerivs(feats, post, opts, &g); - TestFmllrEstimatorFeatDerivs(feats, post, g); + if (RandInt(0, 1) == 0) { + opts.smoothing_count = 500.0; + } + TestFmllrEstimatorMeanDerivs(feats, post, g); + TestFmllrEstimatorFeatDerivs(feats, post, g); TestFmllrEstimatorVarDerivs(feats, post, g); } @@ -448,7 +476,7 @@ void UnitTestGaussianAndFmllrEstimator() { int main() { using namespace kaldi::differentiable_transform; - for (int32 i = 0; i < 5; i++) { + for (int32 i = 0; i < 50; i++) { UnitTestCoreFmllrEstimatorSimple(); UnitTestCoreFmllrEstimatorGeneral(); UnitTestGaussianAndFmllrEstimator(); diff --git a/src/transform/differentiable-fmllr.cc b/src/transform/differentiable-fmllr.cc index 645977ce606..6f2031b1dc2 100644 --- a/src/transform/differentiable-fmllr.cc +++ b/src/transform/differentiable-fmllr.cc @@ -468,6 +468,20 @@ void FmllrEstimator::AdaptFeaturesBackward( MatrixBase *feats_deriv) { KALDI_ASSERT(SameDim(feats, adapted_feats_deriv) && SameDim(feats, *feats_deriv)); + int32 rows_per_chunk = 100; + if (feats.NumRows() > rows_per_chunk) { + // Break it up into 100-frame chunks and recurse. This will reduce roundoff + // error due to the way we work with temporaries. + for (int32 offset = 0; offset < feats.NumRows(); offset += rows_per_chunk) { + int32 n = std::min(rows_per_chunk, feats.NumRows() - offset); + SubMatrix feats_deriv_part = feats_deriv->RowRange(offset, n); + AdaptFeaturesBackward(feats.RowRange(offset, n), + adapted_feats_deriv.RowRange(offset, n), + &feats_deriv_part); + } + return; + } + // in the writeup: \bar{x}_t <-- A^T \bar{y}_t. // In this implementation, x_t corresponds to a // row vector in feats and feats_deriv, so everything is @@ -513,9 +527,9 @@ void FmllrEstimator::EstimateBackward() { estimator_ = NULL; KALDI_ASSERT(G_bar_.IsSymmetric()); - // \bar{n} = - (\bar{A}^T b + 2\bar{G} n + \bar{K}^T m) + // \bar{n} = - (A^T \bar{b} + 2\bar{G} n + \bar{K}^T m) n_bar_.Resize(dim); - n_bar_.AddMatVec(-1.0, A_bar_, kTrans, b_, 0.0); + n_bar_.AddMatVec(-1.0, A_, kTrans, b_bar_, 0.0); n_bar_.AddMatVec(-2.0 * gamma_hat_tot_, G_bar_, kNoTrans, n_, 1.0); n_bar_.AddMatVec(-1.0 * gamma_hat_tot_, K_bar_, kTrans, m_, 1.0); @@ -553,8 +567,12 @@ void FmllrEstimator::AccStatsBackward( const Posterior &post, MatrixBase *feats_deriv) { KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); - int32 T = feats.NumRows(), num_classes = mu_.NumRows(); - Vector s_bar_temp(num_classes); + int32 T = feats.NumRows(), num_classes = mu_.NumRows(), + dim = mu_.NumCols(); + + // Use temporaries for s_bar_ and mu_bar_ to reduce roundoff error. 
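+  // (Reasoning, for reference: these sums are accumulated in BaseFloat, which is
+  // typically single precision.  Once an accumulator already holds a large value,
+  // adding many tiny per-frame terms directly to it loses precision, so we sum
+  // into fresh local temporaries and flush them into s_bar_ / mu_bar_ only every
+  // couple of hundred frames, at the bottom of the loop below.)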
+ Vector s_bar(num_classes); + Matrix mu_bar(num_classes, dim); for (int32 t = 0; t < T; t++) { auto iter = post[t].begin(), end = post[t].end(); SubVector x_t(feats, t), @@ -564,7 +582,7 @@ void FmllrEstimator::AccStatsBackward( int32 i = iter->first; BaseFloat gamma_ti = iter->second, gamma_hat_ti = gamma_ti / s_(i); - SubVector mu_bar_i(mu_bar_, i); + SubVector mu_bar_i(mu_bar, i); // \bar{\mu}_i += \hat{\gamma}_{t,i} \bar{K} x_t. mu_bar_i.AddMatVec(gamma_hat_ti, K_bar_, kNoTrans, x_t, 1.0); gamma_hat_t += gamma_hat_ti; @@ -587,10 +605,18 @@ void FmllrEstimator::AccStatsBackward( double gamma_hat_bar_ti = VecMatVec(mu_i, K_bar_, x_t) + double(gamma_hat_bar_(i)) + double(gamma_hat_bar_t); // \bar{s}_i += \frac{-1}{s_i^2} \gamma_{t,i} \bar{\hat{\gamma}}_{t,i} - s_bar_temp(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_ti; + s_bar(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_ti; + } + if (t == T - 1 || (t > 0 && t % 200 == 0)) { + s_bar_.AddVec(1.0, s_bar); + mu_bar_.AddMat(1.0, mu_bar); + if (t < T - 1) { + s_bar.SetZero(); + mu_bar.SetZero(); + } } } - s_bar_.AddVec(1.0, s_bar_temp); + } BaseFloat FmllrEstimator::ForwardCombined( diff --git a/src/transform/differentiable-fmllr.h b/src/transform/differentiable-fmllr.h index bfbe5590732..81d46095c69 100644 --- a/src/transform/differentiable-fmllr.h +++ b/src/transform/differentiable-fmllr.h @@ -503,7 +503,7 @@ class FmllrEstimator { BaseFloat Estimate(); /// Returns the total count of the posteriors accumulated so far. - BaseFloat TotalCount(); + BaseFloat TotalCount() { return gamma_.Sum(); } /// Return the linear parameter matrix. Adapted features are /// y_t = A x_t + b. You won't necessarily need to @@ -553,11 +553,12 @@ class FmllrEstimator { // Returns the derivative w.r.t. the class means 'mu' that were supplied to the - // constructor. Must not be called until EstimateBackward() has been called. + // constructor. Must not be called until EstimateBackward() and + // AccStatsBackward() have been called. const MatrixBase &GetMeanDeriv() const { return mu_bar_; } // Returns the derivative w.r.t. the variance factors 's' that were supplied - // to the constructor. Must not be called until EstimateBackward() has been - // called. + // to the constructor. Must not be called until EstimateBackward() and + // AccStatsBackward() have been called. const VectorBase &GetVarDeriv() const { return s_bar_; } /** @@ -736,6 +737,98 @@ class FmllrEstimator { }; +/* MeanOnlyTransformEstimator is like a highly simplified version of + FmllrEstimator, where the transform is just y_t = x_t + b. + There are class means but the variances are assumed to be all + unit. (This is equivalent to assuming that they are all identical + with an arbitrary value; the value doesn't actually affect the + learned offset so we assume they are unit). + + The equations involved are like an extremly simplified version + of what we do in class FmllrEstimator, with m as a weighted + average of the means and n as a weighted average of the input + features. The weights come from the posterior information you + supply. + + This object has a similar interface to class FmllrEstimator. + + */ +class MeanOnlyTransformEstimator { + + /** + Constructor. + @param [in] mu Class means, probably as output by class + GaussianEstimator. This class maintains a + reference to this object, so you should ensure + that it exists for the lifetime of this object. + You can ignore the variances from class + GaussianEstimator; they are not used. 
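+
+       For orientation (informal notation; this is what Estimate() is intended
+       to compute): with posteriors \gamma_{t,i}, per-class counts
+       \gamma_i = \sum_t \gamma_{t,i} and total count \gamma = \sum_i \gamma_i,
+       the learned offset is
+          b = (1/\gamma) \sum_i \gamma_i \mu_i  -  (1/\gamma) \sum_t (\sum_i \gamma_{t,i}) x_t,
+       i.e. the posterior-weighted average of the class means minus the
+       posterior-weighted average of the input features.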
+ */ + MeanOnlyTransformEstimator(const MatrixBase &mu); + + /** + Accumulate statistics to estimate the fMLLR transform. + @param [in] feats The feature matrix. A row of it would be called + x_t in the writeup in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. + @param [in] post The posteriors. post.size() must equal feats.NumRows(). + Each element of post is a list of pairs (i, p) where + i is the class label and p is the soft-count. + */ + void AccStats(const MatrixBase &feats, + const Posterior &post); + + /** + Estimate the parameter (the offset b). Returns the + objective-function improvement compared with b = 0, divided by the + total count as returned by TotalCount(). + */ + BaseFloat Estimate(); + + BaseFloat TotalCount(); + + /// Return the bias term b. + const VectorBase &GetOffset() { return b_; } + + /// Computes the adapted features y_t = x_t + b. + /// feats (x) and adapted_feats (y) must have the same dimension. Must + /// only be called after Estimate() has been called. + /// 'adapted_feats' may contain NaN's on entry. + void AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const; + + + /** + This is the backward pass corresponding to the function AdaptFeatures(). + It propagates back only part of the derivative-- not including the part + that's due to how the transform changes when the features change. It + also accumulates within this class instance the derivative w.r.t. + b. You are expected to later call EstimateBackward() and + AccStatsBackward() to propagate the part of the derivative that comes from + the effect on the transform, back to the input features. + + See also AccStatsBackward(). + @param [in] feats The features (x) that were the original input to + AdaptFeatures(). + @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) + that was the result of calling AdaptFeatures(). Must + have the same size as feat. + @param [in,out] feats_deriv The derivative w.r.t. 'feats'; this function + *adds* to it. + */ + void AdaptFeaturesBackward(const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + void EstimateBackward(); + // TODO: finish this. + + private: + + Vector b_; +}; + + } // namespace differentiable_transform } // namespace kaldi From 0d11246e114c4ae41ebd4b5feb1504c19c9179e5 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 30 Nov 2018 21:06:43 -0500 Subject: [PATCH 25/87] [src] Rework math for differentiable fMLLR for greater efficiency. --- src/transform/differentiable-fmllr.cc | 180 +++++++++++++++----------- src/transform/differentiable-fmllr.h | 79 ++++++----- 2 files changed, 141 insertions(+), 118 deletions(-) diff --git a/src/transform/differentiable-fmllr.cc b/src/transform/differentiable-fmllr.cc index 6f2031b1dc2..72cad1e4816 100644 --- a/src/transform/differentiable-fmllr.cc +++ b/src/transform/differentiable-fmllr.cc @@ -336,51 +336,47 @@ FmllrEstimator::FmllrEstimator(const FmllrEstimatorOptions &opts, gamma_.Resize(num_classes); G_.Resize(dim, dim); - K_.Resize(dim, dim); - n_.Resize(dim); + z_.Resize(num_classes, dim); } void FmllrEstimator::AccStats(const MatrixBase &feats, - const Posterior &post) { + const Posterior &post) { KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(), T = feats.NumRows(); + // Use temporaries for the stats and later add them to the stats in the class; // this will reduce roundoff errors if this function is called more than once. 
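+  // (Summary of the reworked flow below: a first pass over the frames accumulates
+  // the per-class counts gamma_i, the per-class weighted feature sums
+  // z_i = \sum_t \gamma_{t,i} x_t, and the per-frame weights \hat{\gamma}_t; G is
+  // then accumulated from 100-row blocks of the features with AddMat2Vec().  K is
+  // no longer formed here at all: Estimate() now builds it from z_ with a single
+  // matrix multiply.)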
- // Also do this every 100 frames or so, again, to reduce roundoff. - SpMatrix G(dim); - Matrix K(dim, dim); - Vector gamma(num_classes), - n(dim); + Vector gamma_hat_t(T, kUndefined), + gamma(num_classes); + for (int32 t = 0; t < T; t++) { auto iter = post[t].begin(), end = post[t].end(); SubVector x_t(feats, t); - BaseFloat gamma_hat_t = 0.0; + BaseFloat this_gamma_hat_t = 0.0; for (; iter != end; ++iter) { int32 i = iter->first; BaseFloat gamma_ti = iter->second, gamma_hat_ti = gamma_ti / s_(i); - SubVector mu_i(mu_, i); + SubVector z_i(z_, i); + z_i.AddVec(gamma_ti, x_t); gamma(i) += gamma_ti; - gamma_hat_t += gamma_hat_ti; - K.AddVecVec(gamma_hat_ti, mu_i, x_t); - } - G.AddVec2(gamma_hat_t, x_t); - n.AddVec(gamma_hat_t, x_t); - - if (t == T - 1 || (t > 0 && t % 100 == 0)) { - gamma_.AddVec(1.0, gamma); - G_.AddSp(1.0, G); - K_.AddMat(1.0, K); - n_.AddVec(1.0, n); - if (t < T - 1) { - gamma.SetZero(); - G.SetZero(); - K.SetZero(); - n.SetZero(); - } + this_gamma_hat_t += gamma_hat_ti; } + gamma_hat_t(t) = this_gamma_hat_t; + } + gamma_.AddVec(1.0, gamma); + + SpMatrix G(dim); + int32 rows_per_chunk = 100; + for (int32 offset = 0; offset < T; offset += rows_per_chunk) { + int32 n_frames = std::min(rows_per_chunk, feats.NumRows() - offset); + SubMatrix feats_part(feats, offset, n_frames, 0, dim); + SubVector gamma_hat_t_part(gamma_hat_t, offset, n_frames); + // the 0.0 value for beta means we don't double-count stats. + G.AddMat2Vec(1.0, feats_part, kTrans, gamma_hat_t_part, 0.0); + G_.AddSp(1.0, G); } } @@ -398,18 +394,37 @@ BaseFloat FmllrEstimator::Estimate() { KALDI_ASSERT(gamma_tot > 0.0 && "You cannot call Estimate() with zero stats."); - gamma_hat_ = gamma_; - gamma_hat_.DivElements(s_); - gamma_hat_tot_ = gamma_hat_.Sum(); - n_.Scale(1.0 / gamma_hat_tot_); + Vector s_inv(s_); + s_inv.InvertElements(); + + // compute \hat{\gamma} = \sum_i \gamma_i / s_i + gamma_hat_tot_ = VecVec(gamma_, s_inv); + + // compute n = (1/\hat{\gamma}) \sum_i (1/s_i) z_i + n_.Resize(dim); + n_.AddMatVec(1.0 / gamma_hat_tot_, z_, kTrans, s_inv, 0.0); - m_.Resize(dim); - m_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kTrans, gamma_hat_, 0.0); - K_.AddVecVec(-gamma_hat_tot_, m_, n_); + { // Set m = 1/\hat{\gamma} \sum_i (\gamma_i / s_i) \mu_i. + Vector s_inv_gamma(s_inv); + s_inv_gamma.MulElements(gamma_); + m_.Resize(dim); + m_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kTrans, s_inv_gamma, 0.0); + } + + + { // Set K := \sum_i (1/s_i) \mu_i z_i^T - \hat{\gamma} m n^T + Matrix mu_s(mu_); + mu_s.MulRowsVec(s_inv); + K_.Resize(dim, dim); + K_.AddMatMat(1.0, mu_s, kTrans, z_, kNoTrans, 0.0); + K_.AddVecVec(-gamma_hat_tot_, m_, n_); + } + + // In AccStats(), we did G := \sum_t \hat{\gamma}_t x_t x_t^T. + // Now we do: G -= \hat{\gamma} n n^T G_.AddVecVec(-gamma_hat_tot_, n_, n_); - KALDI_ASSERT(G_.IsSymmetric(0.001)); - // Make sure G_ is perfectly symmetric, which, mathematically, it is. 
- G_.CopyLowerToUpper(); + KALDI_ASSERT(G_.IsSymmetric(0.0001)); + A_.Resize(dim, dim, kUndefined); BaseFloat gamma_tot_smoothed = gamma_tot; @@ -516,6 +531,12 @@ void FmllrEstimator::EstimateBackward() { KALDI_ASSERT(A_bar_.NumRows() != 0 && "You must call AdaptFeaturesBackward() before calling " "EstimateBackward()."); + + Vector s_inv(s_); + s_inv.InvertElements(); + Vector s_inv_gamma(s_inv); + s_inv_gamma.MulElements(gamma_); + // do \bar{A} -= \bar{b} n^T A_bar_.AddVecVec(-1.0, b_bar_, n_); @@ -533,33 +554,56 @@ void FmllrEstimator::EstimateBackward() { n_bar_.AddMatVec(-2.0 * gamma_hat_tot_, G_bar_, kNoTrans, n_, 1.0); n_bar_.AddMatVec(-1.0 * gamma_hat_tot_, K_bar_, kTrans, m_, 1.0); + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n m_bar_ = b_bar_; m_bar_.AddMatVec(-gamma_hat_tot_, K_bar_, kNoTrans, n_, 1.0); + // \bar{z}_i = (1/s_i) \bar{K}^T \mu_i + 1/(s_i \hat{\gamma}) \bar{n} + z_bar_.Resize(num_classes, dim); + // set \bar{z}_i := \bar{K}^T \mu_i. It's transposed below. + z_bar_.AddMatMat(1.0, mu_, kNoTrans, K_bar_, kNoTrans, 0.0); + // \bar{z}_i += 1/\hat{\gamma} \bar{n} + z_bar_.AddVecToRows(1.0 / gamma_hat_tot_, n_bar_); + // \bar{z}_i /= s_i + z_bar_.MulRowsVec(s_inv); + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) gamma_hat_tot_bar_ = -1.0 * VecMatVec(n_, G_bar_, n_) - VecMatVec(m_, K_bar_, n_) - (1.0 / gamma_hat_tot_) * (VecVec(n_, n_bar_) + VecVec(m_, m_bar_)); - // \bar{\hat{\gamma}}_i = \bar{\hat{\gamma}} + \frac{1}{\hat{\gamma}} \mu_i^T \bar{m} - gamma_hat_bar_.Resize(num_classes, kUndefined); - gamma_hat_bar_.Set(gamma_hat_tot_bar_); - gamma_hat_bar_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kNoTrans, m_bar_, 1.0); - - // each row of Kt_bar_mu_ will become \bar{K}^T \mu_i. But the - // expression is transposed below. - Kt_bar_mu_.Resize(num_classes, dim); - Kt_bar_mu_.AddMatMat(1.0, mu_, kNoTrans, K_bar_, kNoTrans, 0.0); - - // \bar{\mu}_i <-- \frac{\hat{\gamma}_i}{\gamma} \bar{m} - // we'll add another term to this later in AccStatsBackward(). + // Set \bar{mu}_i = (1/s_i) \bar{K} z_i + (\gamma_i / (s_i \hat{\gamma})) \bar{m} mu_bar_.Resize(num_classes, dim); - mu_bar_.AddVecVec(1.0 / gamma_hat_tot_, gamma_hat_, m_bar_); - - // s_bar_ will be written to in AccStatsBackward(), but we initialize it here. + mu_bar_.AddMatMat(1.0, z_, kNoTrans, K_bar_, kTrans, 0.0); + mu_bar_.MulRowsVec(s_inv); + mu_bar_.AddVecVec(1.0 / gamma_hat_tot_, s_inv_gamma, m_bar_); + + // Add all terms in \bar{s}_i except the one involving \bar{\hat{\gamma}}_t. + // The full equation (also present in the header) is: + // \bar{s}_i = -(1 / s_i^2) * ( + // \mu_i^T \bar{K} z_i + (1 / \hat{\gamma}) \z_i^T \bar{n} + // + (\gamma_i / \hat{\gamma}) \mu_i^T \bar{m} + \gamma_i \hat{\gamma} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + // Noticing that some expressions in it are common with \bar{\mu}_i, this can + // be simplified to: + // \bar{s}_i = (-1/s_i) \mu_i^T \bar{\mu}_i + // - (1/s_i^2) * ((1 / \hat{\gamma}) \z_i^T \bar{n} + \gamma_i \hat{\gamma} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) s_bar_.Resize(num_classes); + // do s_bar_ -= (1 / \hat{\gamma}) \z_i^T \bar{n}. We'll later multiply by 1/s_i^2. 
+ s_bar_.AddMatVec(-1.0 / gamma_hat_tot_, z_, kNoTrans, n_bar_, 0.0); + // do s_bar_(i) -= \gamma_i \bar{\hat{\gamma}} + s_bar_.AddVec(-1.0 * gamma_hat_tot_bar_, gamma_); + // do s_bar_(i) *= 1/s_i + s_bar_.MulElements(s_inv); + // do s_bar_(i) -= \mu_i^T \bar{\mu}_i + s_bar_.AddDiagMatMat(-1.0, mu_, kNoTrans, mu_bar_, kTrans, 1.0); + // do s_bar_(i) *= 1/s_i + s_bar_.MulElements(s_inv); + // OK, s_bar_ is now set up with all but the last term. It remains only to do: + // \bar{s}_i += (-1/s_i^2) \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) } void FmllrEstimator::AccStatsBackward( @@ -567,12 +611,10 @@ void FmllrEstimator::AccStatsBackward( const Posterior &post, MatrixBase *feats_deriv) { KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); - int32 T = feats.NumRows(), num_classes = mu_.NumRows(), - dim = mu_.NumCols(); + int32 T = feats.NumRows(), num_classes = mu_.NumRows(); - // Use temporaries for s_bar_ and mu_bar_ to reduce roundoff error. + // Use temporaries for s_bar_, to reduce roundoff error. Vector s_bar(num_classes); - Matrix mu_bar(num_classes, dim); for (int32 t = 0; t < T; t++) { auto iter = post[t].begin(), end = post[t].end(); SubVector x_t(feats, t), @@ -582,41 +624,29 @@ void FmllrEstimator::AccStatsBackward( int32 i = iter->first; BaseFloat gamma_ti = iter->second, gamma_hat_ti = gamma_ti / s_(i); - SubVector mu_bar_i(mu_bar, i); - // \bar{\mu}_i += \hat{\gamma}_{t,i} \bar{K} x_t. - mu_bar_i.AddMatVec(gamma_hat_ti, K_bar_, kNoTrans, x_t, 1.0); gamma_hat_t += gamma_hat_ti; - SubVector Kt_bar_mu_i(Kt_bar_mu_, i); - // \bar{x}_t += \hat{\gamma}_{t,i} \bar{K}^T \mu_i - x_bar_t.AddVec(gamma_hat_ti, Kt_bar_mu_i); + SubVector z_bar_i(z_bar_, i); + // \bar{x}_t += \gamma_{t,i} \bar{z}_i + x_bar_t.AddVec(gamma_ti, z_bar_i); } - double gamma_hat_bar_t = VecMatVec(x_t, G_bar_, x_t) + - (1.0 / gamma_hat_tot_) * VecVec(x_t, n_bar_); + double gamma_hat_bar_t = VecMatVec(x_t, G_bar_, x_t); // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t x_bar_t.AddMatVec(2.0 * gamma_hat_t, G_bar_, kNoTrans, x_t, 1.0); - // \bar{x}_t += \frac{\hat{\gamma}_t}{\hat{\gamma}} \bar{n} - x_bar_t.AddVec(gamma_hat_t / gamma_hat_tot_, n_bar_); for (iter = post[t].begin(); iter != end; ++iter) { int32 i = iter->first; BaseFloat gamma_ti = iter->second; SubVector mu_i(mu_, i); - double gamma_hat_bar_ti = VecMatVec(mu_i, K_bar_, x_t) + - double(gamma_hat_bar_(i)) + double(gamma_hat_bar_t); - // \bar{s}_i += \frac{-1}{s_i^2} \gamma_{t,i} \bar{\hat{\gamma}}_{t,i} - s_bar(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_ti; + // \bar{s}_i -= \frac{1}{s_i^2} \gamma_{t,i} \bar{\hat{\gamma}}_t + s_bar(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_t; } if (t == T - 1 || (t > 0 && t % 200 == 0)) { s_bar_.AddVec(1.0, s_bar); - mu_bar_.AddMat(1.0, mu_bar); - if (t < T - 1) { + if (t < T - 1) s_bar.SetZero(); - mu_bar.SetZero(); - } } } - } BaseFloat FmllrEstimator::ForwardCombined( diff --git a/src/transform/differentiable-fmllr.h b/src/transform/differentiable-fmllr.h index 81d46095c69..3e9bac3b1f5 100644 --- a/src/transform/differentiable-fmllr.h +++ b/src/transform/differentiable-fmllr.h @@ -628,31 +628,30 @@ class FmllrEstimator { // Before Estimate() is called, it won't contain the 2nd term, only the first. Matrix G_; - // This contains - // K = (\sum_{t,i} \hat{\gamma}_{t,i} \mu_i x_t^T) - \hat{\gamma} m n^T - // Before Estimate() is called, it won't contain the 2nd term, only the first. 
- Matrix K_; - - // After Estimate() is called, this will be the quantity: - // n = \frac{1}{\hat{\gamma}} \sum_t \hat{\gamma}_t x_t. - // Before Estimate() is called, this won't include the factor - // 1/\hat{\gamma}, so it will be just \sum_t \hat{\gamma}_t x_t. - Vector n_; + // This is of dimension num_classes by dim (same as mu_). It contains + // the weighted sums of the input data, for each class: + // z_i = \sum_t \gamma_{t,i} x_i. + Matrix z_; /////////// Quantities that are computed when Estimate() is called //////// - // gamma_hat_ is the same as gamma_, but divided by the class-specific variance - // factor s_i. In the writeup it's \hat{\gamma}_i. - Vector gamma_hat_; - // gamma_hat_tot_ is gamma_hat_.Sum(). In the writeup it's \hat{\gamma}. + // gamma_hat_tot_ is the total of gamma_(i) / s_(i), i.e. + // \hat{\gamma} = \sum_i gamma_i / s_i. BaseFloat gamma_hat_tot_; + // After Estimate() is called, this will be the quantity: + // n = \frac{1}{\hat{\gamma}} \sum_i (1/s_i) z_i + Vector n_; // The weighted-average of the means: - // m = \frac{1}{\hat{\gamma}} \sum_i \hat{\gamma}_i \mu_i + // m = \frac{1}{\hat{\gamma}} \sum_i (\gamma_i/s_i) \mu_i Vector m_; + // This contains + // K = (\sum_i (1/s_i) \mu_i z_i^T) - \hat{\gamma} m n^T + Matrix K_; + // The parameter matrix Matrix A_; // The offset term @@ -683,56 +682,50 @@ class FmllrEstimator { Matrix K_bar_; // The derivative w.r.t. n: - // \bar{n} = -\bar{A}^T b - 2\hat{\gamma} \bar{G} n - \hat{\gamma} \bar{K}^T m + // \bar{n} = -A^T \bar{b} - 2\hat{\gamma} \bar{G} n - \hat{\gamma} \bar{K}^T m Vector n_bar_; // The derivative w.r.t. m: // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n Vector m_bar_; + // The derivative w.r.t the z_i quantities. The i'th row is: + // \bar{z}_i = (1/s_i) \bar{K}^T \mu_i + 1/(s_i \hat{\gamma}) \bar{n} + Matrix z_bar_; + // gamma_hat_tot_bar_ is \bar{\hat{\gamma}} in the writeup; // it's: // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) BaseFloat gamma_hat_tot_bar_; - // gamma_hat_bar_ contains the quantities that we write as - // \bar{\hat{\gamma}}_i in the writeup. It's: - // \bar{\hat{\gamma}}_i = \bar{\hat{\gamma}} + \frac{1}{\hat{\gamma}} \mu_i^T \bar{m} - Vector gamma_hat_bar_; - - // Kt_bar_mu_ has the same dimension as mu_; the i'th row contains the - // quantity \bar{K}^T \mu_i. This is cached here to avoid a matrix multiplication - // during the backward pass. - Matrix Kt_bar_mu_; - - - //////////// Quantities that are written to in AccStatsBackward() /////////// // The i'th row contains the derivative w.r.t mu_i. - // In Estimate(), this is set to: - // \bar{\mu}_i = \frac{\hat{\gamma}_i}{\hat{\gamma}} \bar{m} - // and in AccStatsBackward(), we do: - // \bar{\mu}_i += \sum_t \hat{\gamma}_{t,i} \bar{K} x_t. + // This is: + // \bar{\mu}_i = (1/s_i) \bar{K} z_i + (\gamma_i / (s_i \hat{\gamma})) \bar{m} Matrix mu_bar_; - /// s_bar_(i) contains the derivative w.r.t the variance factor s_i, - /// which we write in the writeup as \bar{s}_i. 
- /// It equals: \bar{s}_i = \frac{-1}{s_i^2} \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_{t,i} - /// \bar{\hat{\gamma}}_{t,i}, computed as a temporary, equals: - /// \bar{hat{\gamma}}_{t,i} = \mu_i^T \bar{K} x_t + \bar{\hat{\gamma}}_i + \bar{\hat{\gamma}}_t - /// where - /// \bar{\hat{\gamma}}_t = x_t^T \bar{G} x_t + \frac{1}{\hat{\gamma}} x_t^T \bar{n} - Vector s_bar_; + //////////// Quantities that are written to in AccStatsBackward() /////////// + // s_bar_(i) contains the derivative w.r.t the variance factor s_i, + // which we write in the writeup as \bar{s}_i. + // It is: + // \bar{s}_i = -(1 / s_i^2) * ( + // \mu_i^T \bar{K} z_i + (1 / \hat{\gamma}) \z_i^T \bar{n} + // + (\gamma_i / \hat{\gamma}) \mu_i^T \bar{m} + \gamma_i \bar{\hat{\gamma}} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + // where + // \bar{\hat{\gamma}}_t = x_t^T \bar{G} x_t . + // Note: we add all but the first terms during Estimate(), and only the one + // with \sum_t in it in AccStatsBackward. + Vector s_bar_; // There is another quantity that's updated by AccStatsBackward(), which is // \bar{x}_t, the derivative w.r.t. x_t. AccStatsBackward() does not include // the term \bar{x}_t = A^T \bar{y}_t. But it does include the rest of the // terms, doing: // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t - // + \sum_i \hat{\gamma}_{t,i} \bar{K}^T \mu_i - // + \frac{\hat{\gamma}_t}{\hat{\gamma}} \bar{n} - // There is no variable for this; it's a temporary. + // + \sum_i \gamma_{t,i} \bar{z}_i + // There is no member variable for this; it's a temporary. }; From da36067d54948dffb24d679d0b642259dbc28816 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 4 Dec 2018 20:14:50 -0500 Subject: [PATCH 26/87] [src] Add MeanOnlyTransformEstimator and tests for it; misc. fixes. --- src/transform/differentiable-fmllr-test.cc | 145 +++++++++++++++++-- src/transform/differentiable-fmllr.cc | 136 +++++++++++++++++- src/transform/differentiable-fmllr.h | 153 ++++++++++++++++++--- 3 files changed, 399 insertions(+), 35 deletions(-) diff --git a/src/transform/differentiable-fmllr-test.cc b/src/transform/differentiable-fmllr-test.cc index e2dfdad9318..84b37d8f993 100644 --- a/src/transform/differentiable-fmllr-test.cc +++ b/src/transform/differentiable-fmllr-test.cc @@ -203,12 +203,13 @@ void TestGaussianEstimatorDerivs(const MatrixBase &feats, } else { KALDI_LOG << "Testing var derivs."; var_derivs.SetRandn(); + var_derivs.Add(0.2); // Nonzero mean makes the test easier to pass } g->SetOutputDerivs(mean_derivs, var_derivs); Matrix feats_deriv(feats.NumRows(), feats.NumCols()); - g->Backward(feats, post, &feats_deriv); + g->AccStatsBackward(feats, post, &feats_deriv); - BaseFloat epsilon = 1.0e-03; + BaseFloat epsilon = 1.0e-04; for (int32 i = 0; i < n; i++) { Matrix new_feats(feats.NumRows(), @@ -405,6 +406,8 @@ void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, Matrix new_feats(T, dim, kUndefined), new_adapted_feats(T, dim, kUndefined); new_feats.SetRandn(); + new_feats.Add(RandGauss()); // will help to test whether the indirect + // part of the derivative is accurate. 
new_feats.Scale(epsilon); expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); new_feats.AddMat(1.0, feats); @@ -424,7 +427,122 @@ void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, } -void UnitTestGaussianAndFmllrEstimator() { +void TestMeanOnlyTransformEstimatorMeanDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = m.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + MeanOnlyTransformEstimator m2(new_mu); + m2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorFeatDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. 
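+  // (As in the tests above: for each small random perturbation of the input, the
+  // predicted first-order change tr(perturbation^T feats_deriv) is compared with
+  // the observed change in the test objective,
+  // tr(new_adapted_feats^T adapted_feats_deriv) - tr(adapted_feats^T adapted_feats_deriv),
+  // and they must agree to within the 10% tolerance below.)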
+ int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + MeanOnlyTransformEstimator m2(mu); + m2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void UnitTestGaussianAndEstimators() { // It's important that the number of classes be greater than the dimension, or // we would get a low-rank K. int32 num_classes = RandInt(30, 40), @@ -435,7 +553,7 @@ void UnitTestGaussianAndFmllrEstimator() { Matrix feats(num_frames, dim); feats.SetRandn(); - feats.Add(0.1); // Nonzero offset tests certain aspects of the code better. + feats.Add(0.2); // Nonzero offset tests certain aspects of the code better. Posterior post(num_frames); for (int32 t = 0; t < num_frames; t++) { int32 n = RandInt(0, 2); @@ -461,9 +579,20 @@ void UnitTestGaussianAndFmllrEstimator() { opts.smoothing_count = 500.0; } - TestFmllrEstimatorMeanDerivs(feats, post, g); - TestFmllrEstimatorFeatDerivs(feats, post, g); - TestFmllrEstimatorVarDerivs(feats, post, g); + { // test FmllrEstimator + TestFmllrEstimatorMeanDerivs(feats, post, g); + TestFmllrEstimatorFeatDerivs(feats, post, g); + TestFmllrEstimatorVarDerivs(feats, post, g); + } + + { // test MeanOnlyTransformEstimator. 
+ TestMeanOnlyTransformEstimatorMeanDerivs(feats, post, g); + TestMeanOnlyTransformEstimatorFeatDerivs(feats, post, g); + } + + + + } @@ -479,7 +608,7 @@ int main() { for (int32 i = 0; i < 50; i++) { UnitTestCoreFmllrEstimatorSimple(); UnitTestCoreFmllrEstimatorGeneral(); - UnitTestGaussianAndFmllrEstimator(); + UnitTestGaussianAndEstimators(); } std::cout << "Test OK.\n"; } diff --git a/src/transform/differentiable-fmllr.cc b/src/transform/differentiable-fmllr.cc index 72cad1e4816..3aa1df8e829 100644 --- a/src/transform/differentiable-fmllr.cc +++ b/src/transform/differentiable-fmllr.cc @@ -202,7 +202,8 @@ GaussianEstimator::GaussianEstimator(int32 num_classes, int32 feature_dim): void GaussianEstimator::AccStats(const MatrixBase &feats, const Posterior &post) { KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); - int32 T = feats.NumRows(); + int32 T = feats.NumRows(), + num_classes = m_.NumRows(); auto iter = post.begin(); for (int32 t = 0; t < T; t++,++iter) { SubVector feat(feats, t); @@ -211,6 +212,8 @@ void GaussianEstimator::AccStats(const MatrixBase &feats, end2 = this_post.end(); for (; iter2 != end2; ++iter2) { int32 i = iter2->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); BaseFloat p = iter2->second; gamma_(i) += p; SubVector this_m(m_, i); @@ -299,9 +302,10 @@ int32 GaussianEstimator::Dim() const { return std::max(m_.NumCols(), mu_.NumCols()); } -void GaussianEstimator::Backward(const MatrixBase &feats, - const Posterior &post, - const MatrixBase *feats_deriv) { +void GaussianEstimator::AccStatsBackward( + const MatrixBase &feats, + const Posterior &post, + const MatrixBase *feats_deriv) { // The equation we're implementing is: // \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) // See the comment in the header: @@ -357,6 +361,8 @@ void FmllrEstimator::AccStats(const MatrixBase &feats, BaseFloat this_gamma_hat_t = 0.0; for (; iter != end; ++iter) { int32 i = iter->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); BaseFloat gamma_ti = iter->second, gamma_hat_ti = gamma_ti / s_(i); SubVector z_i(z_, i); @@ -482,7 +488,8 @@ void FmllrEstimator::AdaptFeaturesBackward( const MatrixBase &adapted_feats_deriv, MatrixBase *feats_deriv) { KALDI_ASSERT(SameDim(feats, adapted_feats_deriv) && - SameDim(feats, *feats_deriv)); + SameDim(feats, *feats_deriv) && + G_bar_.NumRows() == 0); int32 rows_per_chunk = 100; if (feats.NumRows() > rows_per_chunk) { // Break it up into 100-frame chunks and recurse. This will reduce roundoff @@ -673,5 +680,124 @@ FmllrEstimator::~FmllrEstimator() { delete estimator_; // in case Estimate() was never called. } + +MeanOnlyTransformEstimator::MeanOnlyTransformEstimator( + const MatrixBase &mu): mu_(mu) { + int32 num_classes = mu_.NumRows(), + dim = mu_.NumCols(); + gamma_.Resize(num_classes); + input_sum_.Resize(dim); +} + +void MeanOnlyTransformEstimator::AccStats(const MatrixBase &feats, + const Posterior &post) { + int32 T = feats.NumRows(), + num_classes = mu_.NumRows(); + KALDI_ASSERT(static_cast(post.size()) == T); + + for (int32 t = 0; t < T; t++) { + BaseFloat gamma_t = 0.0; // Total weight for this frame. 
+ auto iter = post[t].begin(), end = post[t].end(); + for (; iter != end; ++iter) { + int32 i = iter->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat gamma_ti = iter->second; + gamma_t += gamma_ti; + gamma_(i) += gamma_ti; + } + SubVector feat(feats, t); + KALDI_ASSERT(gamma_t >= 0); + input_sum_.AddVec(gamma_t, feat); + } +} + + +void MeanOnlyTransformEstimator::Estimate() { + double tot_gamma = gamma_.Sum(); + int32 dim = mu_.NumCols(); + if (tot_gamma <= 0.0) + KALDI_ERR << "You cannot call Estimate() if total count is zero."; + Vector gamma_float(gamma_); + Vector expected_mean(dim); + expected_mean.AddMatVec(1.0 / tot_gamma, mu_, kTrans, gamma_float, 0.0); + // basically: offset_ = expected_mean - observed_mean, + // where observed_mean = input_sum_ / tot_gamma. + offset_ = expected_mean; + offset_.AddVec(-1.0 / tot_gamma, input_sum_); + output_deriv_sum_.Resize(dim); +} + +void MeanOnlyTransformEstimator::AdaptFeatures( + const MatrixBase &feats, + MatrixBase *adapted_feats) const { + adapted_feats->CopyRowsFromVec(offset_); + adapted_feats->AddMat(1.0, feats); +} + +void MeanOnlyTransformEstimator::AdaptFeaturesBackward( + const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + int32 dim = mu_.NumCols(); + Vector output_deriv_sum(dim); + output_deriv_sum.AddRowSumMat(1.0, adapted_feats_deriv); + output_deriv_sum_.AddVec(1.0, output_deriv_sum); + feats_deriv->AddMat(1.0, adapted_feats_deriv); +} + +void MeanOnlyTransformEstimator::EstimateBackward() { + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + mu_bar_.Resize(num_classes, dim); + Vector gamma(gamma_), + output_deriv_sum(output_deriv_sum_); + BaseFloat gamma_tot = gamma_.Sum(); + KALDI_ASSERT(gamma_tot > 0.0); + mu_bar_.AddVecVec(1.0 / gamma_tot, gamma, output_deriv_sum); + + x_deriv_ = output_deriv_sum; + x_deriv_.Scale(-1.0 / gamma_tot); +} + + +void MeanOnlyTransformEstimator::AccStatsBackward( + const MatrixBase &feats, + const Posterior &post, + MatrixBase *feats_deriv) { + + int32 T = feats.NumRows(); + // tot_weight will be the total weight of the posteriors in 'post' + // for each frame. + Vector tot_weight(T, kUndefined); + for (int32 t = 0; t < T; t++) { + BaseFloat gamma_t = 0.0; // Total weight for this frame. + auto iter = post[t].begin(), end = post[t].end(); + for (; iter != end; ++iter) + gamma_t += iter->second; + tot_weight(t) = gamma_t; + } + feats_deriv->AddVecVec(1.0, tot_weight, x_deriv_); +} + +void MeanOnlyTransformEstimator::ForwardCombined( + const MatrixBase &feats, + const Posterior &post, + MatrixBase *adapted_feats) { + AccStats(feats, post); + Estimate(); + AdaptFeatures(feats, adapted_feats); +} + +void MeanOnlyTransformEstimator::BackwardCombined( + const MatrixBase &feats, + const Posterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); + EstimateBackward(); + AccStatsBackward(feats, post, feats_deriv); +} + + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/transform/differentiable-fmllr.h b/src/transform/differentiable-fmllr.h index 3e9bac3b1f5..df086109f6e 100644 --- a/src/transform/differentiable-fmllr.h +++ b/src/transform/differentiable-fmllr.h @@ -255,6 +255,18 @@ class CoreFmllrEstimator { lead to excessive roundoff if you had a large amount of data. We'll later on create a separate mechanism for accumulating stats over all the data, given the full tree. 
+ + The normal usage pattern would be: + - Construct the object. + - Call AccStats() for each sequence. + - Call Estimate() + - Call GetMeans() and GetVars() to obtain the means and vars, and do + something with them, e.g. compute some kind of objective, from which + you would obtain derivatives w.r.t. those means and vars. + - Call SetOutputDerivs() to tell this class what those derivatives w.r.t. + the means and vars are. + - Call AccStatsBackward() for each sequence to propagate the derivatives + back to the features that were used to estimate the means and vars. */ class GaussianEstimator { public: @@ -322,9 +334,9 @@ class GaussianEstimator { // This function will *add to* feats_deriv, // so it must have a well-defined value on // entry. - void Backward(const MatrixBase &feats, - const Posterior &post, - const MatrixBase *feats_deriv); + void AccStatsBackward(const MatrixBase &feats, + const Posterior &post, + const MatrixBase *feats_deriv); private: /* Notes on implementation of GaussianEstimator. @@ -354,12 +366,12 @@ class GaussianEstimator { We write \bar{foo} for a derivative of the objective function w.r.t. foo. We are provided by the user with with \bar{\mu}_i and \bar{s}_i, when they - call SetOutputDerivs(). We first compute - \bar{m}_i and \bar{v}_i (the derivs w.r.t. the raw statistics) as follows: - \bar{m}_i = 0 if \gamma_i is 0, otherwise: + call SetOutputDerivs(); and we aim to compute \bar{m}_i and \bar{v}_i, which + are the derivs w.r.t. the raw statistics. This is done as follows: + \bar{m}_i = 0 if \gamma_i is 0, otherwise: \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i m_i}{\gamma_i^2} if s_i > variance_floor, else 0) - = or 0 if \gamma_i is 0, otherwise: + = or 0 if \gamma_i is 0, otherwise: \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i \mu_i}{\gamma_i} if s_i > variance_floor, else 0) \bar{v}_i = 0 if \gamma_i is 0 or s_i equals variance_floor, otherwise: @@ -745,9 +757,42 @@ class FmllrEstimator { This object has a similar interface to class FmllrEstimator. + This class would normally be used as follows: + - Construct an instance of the class (probably for a particular speaker on + a particular minibatch). + + Then, either: + + - Call AccStats() one or more times. + - Call Estimate(). + - Call AdaptFeatures() one or more times to get the output features. + - Do something with those output features that (if you are training) + gives you some kind of objective-function derivative w.r.t. those + features. Then if you are training, do what's below: + - Call AdaptFeaturesBackward() one or more times to get part of the + derivative w.r.t. the input features. Note: the calls to AdaptFeatures() + and AdaptFeaturesBackward() may be interleaved, since the call to + AdaptFeatures() does not modify the object. + - Call EstimateBackward() + - Call AccStatsBackward() one or more times to get the part of the + derivative w.r.t. the input features that comes from the effect + on the transform itself. + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances (these will be passed to class GaussianEstimator, + and eventually to the features). + + Or: if there is only one training sequence, you can use the +o simplified interface: after calling the constructor, + + - call ForwardCombined() + - call BackwardCombined() + - Make use of the call GetMeanDeriv() to account for the effect of the + features on the class means and variances, with the help of class + GaussianEstimator. 
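+
+   As a concrete sketch of the simplified single-sequence interface (this mirrors
+   the test code; 'feats', 'post', T, dim and the derivative 'adapted_feats_deriv'
+   are assumed to be provided by the caller):
+
+     MeanOnlyTransformEstimator m(mu);   // mu: class means, e.g. from GaussianEstimator.
+     Matrix<BaseFloat> adapted_feats(T, dim, kUndefined);
+     m.ForwardCombined(feats, post, &adapted_feats);
+     // ... use adapted_feats, obtain adapted_feats_deriv = d(objf)/d(adapted_feats) ...
+     Matrix<BaseFloat> feats_deriv(T, dim);  // zero at entry; BackwardCombined() adds to it.
+     m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv);
+     const MatrixBase<BaseFloat> &mean_deriv = m.GetMeanDeriv();
+     // mean_deriv would then be propagated further back via the GaussianEstimator.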
*/ class MeanOnlyTransformEstimator { - + public: /** Constructor. @param [in] mu Class means, probably as output by class @@ -772,16 +817,15 @@ class MeanOnlyTransformEstimator { const Posterior &post); /** - Estimate the parameter (the offset b). Returns the - objective-function improvement compared with b = 0, divided by the - total count as returned by TotalCount(). + Estimate the parameter (the offset b). Requires the total count to be + nonzero. */ - BaseFloat Estimate(); + void Estimate(); - BaseFloat TotalCount(); + BaseFloat TotalCount() { return gamma_.Sum(); } /// Return the bias term b. - const VectorBase &GetOffset() { return b_; } + const VectorBase &GetOffset() { return offset_; } /// Computes the adapted features y_t = x_t + b. /// feats (x) and adapted_feats (y) must have the same dimension. Must @@ -794,13 +838,11 @@ class MeanOnlyTransformEstimator { /** This is the backward pass corresponding to the function AdaptFeatures(). It propagates back only part of the derivative-- not including the part - that's due to how the transform changes when the features change. It - also accumulates within this class instance the derivative w.r.t. - b. You are expected to later call EstimateBackward() and - AccStatsBackward() to propagate the part of the derivative that comes from - the effect on the transform, back to the input features. - + that's due to how the offset changes when the features change. It + also accumulates within this class instance the derivative w.r.t. the + offset. See also AccStatsBackward(). + @param [in] feats The features (x) that were the original input to AdaptFeatures(). @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) @@ -813,12 +855,79 @@ class MeanOnlyTransformEstimator { const MatrixBase &adapted_feats_deriv, MatrixBase *feats_deriv); + /** + Backward pass corresponding to Estimate(). Should be called after + you've called AdaptFeatures() on all utterances. Computes the + derivatives w.r.t. the mean. */ void EstimateBackward(); - // TODO: finish this. + + /** + Returns the derivative w.r.t. the class means 'mu' that were supplied to + the constructor. Must not be called until EstimateBackward() has been + called. */ + const MatrixBase &GetMeanDeriv() const { return mu_bar_; } + + /** + This is the backward pass corresponding to AccStats(). You call this after + calling EstimateBackward(). It computes the part of the derivative w.r.t. + 'feats' that comes from the effect on the transform parameters. You will + normally have previously called AdaptFeaturesBackward() on these same + features. + @param [in] feats The features as given to AccStats() + @param [in,out] feats_deriv This function *adds* to feats_deriv. + It adds the terms in \bar{x}_t that arise from + the derivative w.r.t. the offset b. + */ + void AccStatsBackward(const MatrixBase &feats, + const Posterior &post, + MatrixBase *feats_deriv); + + + /** + Combines AccStats(), Estimate() and AdaptFeatures() in one call; + for use when there is only one sequence. + @param [in] feats The features we're estimating the fMLLR parameters from + @param [in] post The posteriors corresponding to 'feats + @param [out] adapted_feats A matrix the same size as 'feats', to which + the adapted features will be written. May contain + NaNs at entry. + */ + void ForwardCombined(const MatrixBase &feats, + const Posterior &post, + MatrixBase *adapted_feats); + /** + Combines AdaptFeaturesBackward(), EstimateBackward(), and + AccStatsBackward(); for use when there is only one sequence. 
+ Note: 'feats_deriv' is *added* to so must be defined at entry. + */ + void BackwardCombined(const MatrixBase &feats, + const Posterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); private: + // The means, one row per class. A reference to an object owned elsewhere. + const MatrixBase &mu_; - Vector b_; + // The counts per class + Vector gamma_; + // The total of the input features, weighted by total posterior. + Vector input_sum_; + + // The offset. + Vector offset_; + + // The total of the derivative w.r.t. the output. + Vector output_deriv_sum_; + + // The derivative w.r.t. each row of the input features-- i.e. the part of the + // derivative that comes from the effect via the offset. This equals + // (-1 / total-count) * output_deriv_sum_. + Vector x_deriv_; + + // The derivative w.r.t. mu: + // (1/gamma_tot) gamma_ . output_deriv_sum_^T. + Matrix mu_bar_; }; From e265781dc1c2ccf4778d7daefd3fefd445c02977 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 4 Dec 2018 21:49:59 -0500 Subject: [PATCH 27/87] [Small fixes to DifferentiableTransform stuff.] --- src/transform/differentiable-transform.h | 40 ++++++++++++++---------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/transform/differentiable-transform.h b/src/transform/differentiable-transform.h index eda3b64db3f..b1482590c8c 100644 --- a/src/transform/differentiable-transform.h +++ b/src/transform/differentiable-transform.h @@ -56,7 +56,8 @@ class SpeakerStatsItf { down to the bottom neural net. The reason this is non-trivial (i.e. why it's not just a matrix multiplication) is that the value of the transform itself depends on the features, and also on the speaker-independent statistics for - each class (i.e. the mean and variance), which also depends on the features. + each class (i.e. the mean and variance), which also depend on the features + sicne we estimate them from the same minibatch. You can view this as an extension of things like BatchNorm, except the interface is more complicated because there is a dependence on the per-frame class labels. @@ -65,21 +66,22 @@ class SpeakerStatsItf { minimal tree, with hundreds instead of thousands of states. Part of the reason for using a smaller number of states is that, to make the thing properly differentiable during training, we need to use a small enough number - of states that we can obtain a reasonable estimate for the mean and variance - of a Gaussian for each one in training time. Anyway, see + of states that we can obtain a reasonable estimate for the mean and (spherical) + variance of a Gaussian for each one in training time. Anyway, as you can see in http://isl.anthropomatik.kit.edu/pdf/Nguyen2017.pdf, it's generally better - for this kind of thing to use "simple target models" for adaptation. + for this kind of thing to use "simple target models" for adaptation rather than + very complex models. Note: for training utterances we'll generally get the class labels used for adatpation in a supervised manner, either by aligning a previous system like - a GMM system, or from the (soft) posteriors of the the numerator graphs. In - test time, we'll usually be getting these class labels from some kind of - unsupervised process. + a GMM system, or-- more likely-- from the (soft) posteriors of the the + numerator graphs. In test time, we'll usually be getting these class labels + from some kind of unsupervised process. Because we tend to train neural nets on fairly small fixed-size chunks (e.g. 
1.5 seconds), and transforms like fMLLR don't tend to work very well until you have about 5 seconds of data, we will usually be arranging those - chunks into groups where all members of the group comes from the same + chunks into groups where all members of the group come from the same speaker. */ class DifferentiableTransform { @@ -120,7 +122,8 @@ class DifferentiableTransform { per speaker. Caution: the order of both the input and output features, and the posteriors, does not consist of blocks, one per sequence, but rather blocks, one per time frame, so the - sequences are intercalated. + sequences are intercalated. This is the default order; + see operator < of nnet3::Index. @param [in] num_chunks The number of individual sequences (e.g., chunks of speech) represented in 'input'. input.NumRows() will equal num_sequences times the number @@ -138,11 +141,13 @@ class DifferentiableTransform { There is no assumption that the posteriors sum to one; this allows you to do things like silence weighting. @param [out] output The adapted output. This matrix should have the - same dimensions as 'input'. + same dimensions as 'input'. It does not have to be free of + NaNs when you call this function. @return This function returns either NULL or an object of type - DifferentiableTransformItf*, which is expected to be given + DifferentiableTransformItf*, which is expected to later be given to the function TrainingBackward(). It will store - any information that will be needed in the backprop phase. + any information that needs to be remembered for the backward + phase. */ virtual MinibatchInfoItf* TrainingForward( const CuMatrixBase &input, @@ -163,8 +168,8 @@ class DifferentiableTransform { See TrainingForward() for information about these arguments; they should be the same values. - @param [in] minibatch_info The object returned by the corresponding - call to TrainingForward(). The caller + @param [in] minibatch_info The pointer returned by the corresponding + call to TrainingForward() (may be NULL). The caller will likely want to delete that object after calling this function @param [in,out] input_deriv The derivative at the input, i.e. @@ -184,7 +189,7 @@ class DifferentiableTransform { int32 num_chunks, int32 num_spk, const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, + const MinibatchInfoItf *minibatch_info, CuMatrixBase *input_deriv) const = 0; @@ -217,9 +222,12 @@ class DifferentiableTransform { int32 num_spk, const Posterior &posteriors) = 0; - // To be called after repeated alls to Accumulate(), does any estimation that + // To be called after repeated calls to Accumulate(), does any estimation that // is required in training time (normally per-speaker means and possibly // variances. + // @param [in] final_iter An iteration number in the range + // [0, NumFinalIterations()]. In many cases there will + // be only one iteration so this will just be zero. 
virtual void Estimate(int32 final_iter) = 0; // Returns an object representing sufficient statistics for estimating a From a974225b6de2a0c661a7312d94b47c74bc4f20ab Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 4 Dec 2018 22:13:15 -0500 Subject: [PATCH 28/87] [src] more drafting interaces --- src/transform/differentiable-transform.h | 46 +++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/transform/differentiable-transform.h b/src/transform/differentiable-transform.h index b1482590c8c..d3799c9e856 100644 --- a/src/transform/differentiable-transform.h +++ b/src/transform/differentiable-transform.h @@ -87,10 +87,9 @@ class SpeakerStatsItf { class DifferentiableTransform { public: - /// Return the dimension of the input and output features. + /// Return the dimension of the features this operates on. virtual int32 Dim() const = 0; - /// Return the number of classes in the model used for adaptation. These /// will probably correspond to the leaves of a small tree, so they would /// be pdf-ids. This model only keeps track of the number of classes, @@ -169,9 +168,10 @@ class DifferentiableTransform { about these arguments; they should be the same values. @param [in] minibatch_info The pointer returned by the corresponding - call to TrainingForward() (may be NULL). The caller - will likely want to delete that object after - calling this function + call to TrainingForward() (may be NULL). This function + takes possession of the pointer. If for some reason the + backward pass was not done, the caller will likely + want to delete it themselves. @param [in,out] input_deriv The derivative at the input, i.e. dF/d(input), where F is the function we are evaluating. Must have the same dimension as @@ -189,7 +189,7 @@ class DifferentiableTransform { int32 num_chunks, int32 num_spk, const Posterior &posteriors, - const MinibatchInfoItf *minibatch_info, + MinibatchInfoItf *minibatch_info, CuMatrixBase *input_deriv) const = 0; @@ -289,7 +289,7 @@ class NoOpTransform: public DifferentiableTransform { public: int32 Dim() const override { return dim_; } - int32 NumClasses() const override { return num_classes_; } + MinibatchInfoItf* TrainingForward( const CuMatrixBase &input, int32 num_chunks, @@ -305,8 +305,9 @@ class NoOpTransform: public DifferentiableTransform { int32 num_chunks, int32 num_spk, const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, + const MinibatchInfoItf *minibatch_info, CuMatrixBase *input_deriv) const override { + KALDI_ASSERT(minibatch_info == NULL); input_deriv->AddMat(1.0, output_deriv); } @@ -320,13 +321,13 @@ class NoOpTransform: public DifferentiableTransform { const Posterior &posteriors) override { } - SpeakerStatsItf *GetEmptySpeakerStats() override { return NULL; } void TestingAccumulate( const MatrixBase &input, const Posterior &posteriors, SpeakerStatsItf *speaker_stats) const override { } + void TestingForward( const MatrixBase &input, const SpeakerStatsItf &speaker_stats, @@ -337,7 +338,8 @@ class NoOpTransform: public DifferentiableTransform { void Estimate(int32 final_iter) override { } NoOpTransform(const NoOpTransform &other): - dim_(other.dim_), num_classes_(other.num_classes_) { } + DifferentiableTransform(other), + dim_(other.dim_) { } DifferentiableTransform* Copy() const override { return new NoOpTransform(*this); @@ -349,7 +351,6 @@ class NoOpTransform: public DifferentiableTransform { private: int32 dim_; - int32 num_classes_; }; @@ -357,6 +358,8 @@ class NoOpTransform: public DifferentiableTransform 
{ This is a version of the transform class that does a sequence of other transforms, specified by other instances of the DifferentiableTransform interface. + + TODO: finish this. */ class SequenceTransform: public DifferentiableTransform { public: @@ -418,8 +421,9 @@ class SequenceTransform: public DifferentiableTransform { /** - This is a version of the transform class that consists of a number of - other transforms, appended dimension-wise-- e.g. this could be used to + This is a version of the transform class that consists of a number of other + transforms, appended dimension-wise, so its feature dimension is the sum of + the dimensions of the constituent transforms-- e.g. this could be used to implement block-diagonal fMLLR, or a structure where some dimensions are adapted and some are not. */ @@ -441,7 +445,7 @@ class AppendTransform: public DifferentiableTransform { int32 num_chunks, int32 num_spk, const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, + MinibatchInfoItf *minibatch_info, CuMatrixBase *input_deriv) const override; virtual int32 NumFinalIterations(); @@ -477,12 +481,11 @@ class AppendTransform: public DifferentiableTransform { /** - This is a version of the transform class that appends over sub-ranges - of dimensions, so that, for instance, you can implement a block-diagonal - transform or a setup where some dimensions are transformed and some are - not. + This is a version of the transform class that implements fMLLR (with + spherical variances, to make the update equations non-iterative); see + differentiable-fmllr.h. */ -class AppendTransform: public DifferentiableTransform { +class FmllrTransform: public DifferentiableTransform { int32 Dim() const override; int32 NumClasses() const override; MinibatchInfoItf* TrainingForward( @@ -519,7 +522,10 @@ class AppendTransform: public DifferentiableTransform { void Read(std::istream &is, bool binary) override; private: - std::vector transforms_; + int32 dim_; + + // TODO: class means and variances for when the model has been trained. + }; From e06e51ee62c623916b92309014d73c515b54654c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 4 Dec 2018 22:40:12 -0500 Subject: [PATCH 29/87] [src] Move around sources, create adapt/ dir. 
--- src/adapt/Makefile | 22 +++++++++++++++++++ .../differentiable-fmllr-test.cc | 0 .../differentiable-fmllr.cc | 0 .../differentiable-fmllr.h | 0 .../differentiable-transform.h | 0 5 files changed, 22 insertions(+) create mode 100644 src/adapt/Makefile rename src/{transform => adapt}/differentiable-fmllr-test.cc (100%) rename src/{transform => adapt}/differentiable-fmllr.cc (100%) rename src/{transform => adapt}/differentiable-fmllr.h (100%) rename src/{transform => adapt}/differentiable-transform.h (100%) diff --git a/src/adapt/Makefile b/src/adapt/Makefile new file mode 100644 index 00000000000..67e5b78fb10 --- /dev/null +++ b/src/adapt/Makefile @@ -0,0 +1,22 @@ +all: + +include ../kaldi.mk + +TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \ + regression-tree-test fmllr-diag-gmm-test \ + regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test \ + differentiable-fmllr-test + +OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ + regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \ + lvtln.o mllt.o fmpe.o basis-fmllr-diag-gmm.o \ + compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o \ + differentiable-fmllr.o + + +LIBNAME = kaldi-transform + +ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/transform/differentiable-fmllr-test.cc b/src/adapt/differentiable-fmllr-test.cc similarity index 100% rename from src/transform/differentiable-fmllr-test.cc rename to src/adapt/differentiable-fmllr-test.cc diff --git a/src/transform/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc similarity index 100% rename from src/transform/differentiable-fmllr.cc rename to src/adapt/differentiable-fmllr.cc diff --git a/src/transform/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h similarity index 100% rename from src/transform/differentiable-fmllr.h rename to src/adapt/differentiable-fmllr.h diff --git a/src/transform/differentiable-transform.h b/src/adapt/differentiable-transform.h similarity index 100% rename from src/transform/differentiable-transform.h rename to src/adapt/differentiable-transform.h From aa88ec09869d75e2ba95c9ee462ca5e7b883a3e1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 4 Dec 2018 22:48:09 -0500 Subject: [PATCH 30/87] [src] some fixes after moving things. 
--- src/adapt/Makefile | 19 ++++++------------- src/adapt/differentiable-fmllr-test.cc | 4 ++-- src/adapt/differentiable-fmllr.cc | 4 ++-- src/adapt/differentiable-fmllr.h | 2 +- src/adapt/differentiable-transform.h | 2 +- 5 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/adapt/Makefile b/src/adapt/Makefile index 67e5b78fb10..b7df9b00ce9 100644 --- a/src/adapt/Makefile +++ b/src/adapt/Makefile @@ -2,21 +2,14 @@ all: include ../kaldi.mk -TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \ - regression-tree-test fmllr-diag-gmm-test \ - regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test \ - differentiable-fmllr-test +TESTFILES = differentiable-fmllr-test -OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ - regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \ - lvtln.o mllt.o fmpe.o basis-fmllr-diag-gmm.o \ - compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o \ - differentiable-fmllr.o +OBJFILES = differentiable-fmllr.o +LIBNAME = kaldi-adapt -LIBNAME = kaldi-transform - -ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ + ../matrix/kaldi-matrix.a ../util/kaldi-util.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/adapt/differentiable-fmllr-test.cc b/src/adapt/differentiable-fmllr-test.cc index 84b37d8f993..5977f7be5e8 100644 --- a/src/adapt/differentiable-fmllr-test.cc +++ b/src/adapt/differentiable-fmllr-test.cc @@ -1,4 +1,4 @@ -// transform/differentiable-fmllr-test.cc +// adapt/differentiable-fmllr-test.cc // Copyright 2018 Johns Hopkins University (author: Daniel Povey) @@ -17,7 +17,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "transform/differentiable-fmllr.h" +#include "adapt/differentiable-fmllr.h" #include "matrix/sp-matrix.h" namespace kaldi { diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc index 3aa1df8e829..6525e573b17 100644 --- a/src/adapt/differentiable-fmllr.cc +++ b/src/adapt/differentiable-fmllr.cc @@ -1,4 +1,4 @@ -// transform/differentiable-fmllr.cc +// adapt/differentiable-fmllr.cc // Copyright 2018 Johns Hopkins University @@ -17,7 +17,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#include "transform/differentiable-fmllr.h" +#include "adapt/differentiable-fmllr.h" #include "matrix/matrix-functions.h" namespace kaldi { diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h index df086109f6e..61bdbf81f92 100644 --- a/src/adapt/differentiable-fmllr.h +++ b/src/adapt/differentiable-fmllr.h @@ -1,4 +1,4 @@ -// transform/differentiable-fmllr.h +// adapt/differentiable-fmllr.h // Copyright 2018 Johns Hopkins University (author: Daniel Povey) diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h index d3799c9e856..98fcaf11086 100644 --- a/src/adapt/differentiable-transform.h +++ b/src/adapt/differentiable-transform.h @@ -1,4 +1,4 @@ -// transform/differentiable-transform.h +// adapt/differentiable-transform.h // Copyright 2018 Johns Hopkins University (author: Daniel Povey) From 2d500442ca85d63565d7fe2ef4e743900e06c085 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 10 Dec 2018 13:11:28 -0500 Subject: [PATCH 31/87] [src] Various changes, not tested --- src/Makefile | 3 +- src/adapt/Makefile | 3 +- src/adapt/differentiable-fmllr-test.cc | 2 +- src/adapt/differentiable-fmllr.cc | 136 +++-- src/adapt/differentiable-fmllr.h | 109 ++-- src/adapt/differentiable-transform-itf.cc | 101 ++++ src/adapt/differentiable-transform-itf.h | 367 ++++++++++++ src/adapt/differentiable-transform-test.cc | 660 ++++++++++++++++++++ src/adapt/differentiable-transform.cc | 530 +++++++++++++++++ src/adapt/differentiable-transform.h | 662 ++++----------------- src/adapt/generic-transform.h | 315 ++++++++++ src/base/io-funcs.h | 2 +- src/hmm/posterior.h | 30 + src/nnet3/nnet-parse-test.cc | 189 ------ src/nnet3/nnet-parse.cc | 373 ------------ src/nnet3/nnet-parse.h | 123 ---- src/transform/Makefile | 6 +- src/util/text-utils-test.cc | 190 ++++++ src/util/text-utils.cc | 248 ++++++++ src/util/text-utils.h | 92 +++ 20 files changed, 2822 insertions(+), 1319 deletions(-) create mode 100644 src/adapt/differentiable-transform-itf.cc create mode 100644 src/adapt/differentiable-transform-itf.h create mode 100644 src/adapt/differentiable-transform-test.cc create mode 100644 src/adapt/differentiable-transform.cc create mode 100644 src/adapt/generic-transform.h diff --git a/src/Makefile b/src/Makefile index 6dfd146e3d5..8ddd579a9a5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -6,7 +6,7 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat tree gmm transform \ - fstext hmm lm decoder lat kws cudamatrix nnet \ + fstext hmm lm decoder lat kws cudamatrix adapt nnet \ bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \ ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin @@ -168,6 +168,7 @@ lm: base util matrix fstext decoder: base util matrix gmm hmm tree transform lat lat: base util hmm tree matrix cudamatrix: base util matrix +adapt: base util matrix hmm cudamatrix nnet: base util hmm tree matrix cudamatrix nnet2: base util matrix lat gmm hmm tree transform cudamatrix nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext diff --git a/src/adapt/Makefile b/src/adapt/Makefile index b7df9b00ce9..8c8f4204802 100644 --- a/src/adapt/Makefile +++ b/src/adapt/Makefile @@ -4,7 +4,8 @@ include ../kaldi.mk TESTFILES = differentiable-fmllr-test -OBJFILES = differentiable-fmllr.o +OBJFILES = differentiable-fmllr.o differentiable-transform-itf.o \ + generic-transform.o differentiable-transform.o LIBNAME = kaldi-adapt diff --git 
a/src/adapt/differentiable-fmllr-test.cc b/src/adapt/differentiable-fmllr-test.cc index 5977f7be5e8..6f001380608 100644 --- a/src/adapt/differentiable-fmllr-test.cc +++ b/src/adapt/differentiable-fmllr-test.cc @@ -205,7 +205,7 @@ void TestGaussianEstimatorDerivs(const MatrixBase &feats, var_derivs.SetRandn(); var_derivs.Add(0.2); // Nonzero mean makes the test easier to pass } - g->SetOutputDerivs(mean_derivs, var_derivs); + g->AddToOutputDerivs(mean_derivs, var_derivs); Matrix feats_deriv(feats.NumRows(), feats.NumCols()); g->AccStatsBackward(feats, post, &feats_deriv); diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc index 6525e573b17..f19b7c00e51 100644 --- a/src/adapt/differentiable-fmllr.cc +++ b/src/adapt/differentiable-fmllr.cc @@ -1,6 +1,6 @@ // adapt/differentiable-fmllr.cc -// Copyright 2018 Johns Hopkins University +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,6 +23,48 @@ namespace kaldi { namespace differentiable_transform { + +void FmllrEstimatorOptions::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, singular_value_relative_floor); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_floor); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_sharing_weight); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, smoothing_count); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, smoothing_between_class_factor); + WriteToken(os, binary, ""); +} + +void FmllrEstimatorOptions::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &singular_value_relative_floor); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_floor); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_sharing_weight); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &smoothing_count); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &smoothing_between_class_factor); + ExpectToken(is, binary, ""); +} + +void FmllrEstimatorOptions::ReadFromConfig(ConfigLine *config_line) { + config_line->GetValue("singular-value-relative-floor", + &singular_value_relative_floor); + config_line->GetValue("variance-floor", &variance_floor); + config_line->GetValue("variance-sharing-weight", &variance_sharing_weight); + config_line->GetValue("smoothing-count", &smoothing_count); + config_line->GetValue("smoothing-between-class-factor", + &smoothing_between_class_factor); +} + + CoreFmllrEstimator::CoreFmllrEstimator( const FmllrEstimatorOptions &opts, BaseFloat gamma, @@ -200,14 +242,13 @@ GaussianEstimator::GaussianEstimator(int32 num_classes, int32 feature_dim): } void GaussianEstimator::AccStats(const MatrixBase &feats, - const Posterior &post) { + const SubPosterior &post) { KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); int32 T = feats.NumRows(), num_classes = m_.NumRows(); - auto iter = post.begin(); - for (int32 t = 0; t < T; t++,++iter) { + for (int32 t = 0; t < T; t++) { SubVector feat(feats, t); - const std::vector > this_post = *iter; + const std::vector > &this_post = post[t]; auto iter2 = this_post.begin(), end2 = this_post.end(); for (; iter2 != end2; ++iter2) { @@ -264,7 +305,7 @@ void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { v_.Resize(0); } -void GaussianEstimator::SetOutputDerivs( +void 
GaussianEstimator::AddToOutputDerivs( const MatrixBase &mean_derivs, const VectorBase &var_derivs) { KALDI_ASSERT(SameDim(mean_derivs, mu_) && @@ -275,8 +316,11 @@ void GaussianEstimator::SetOutputDerivs( variance_floor = variance_floor_, gamma = gamma_.Sum(); KALDI_ASSERT(gamma > 0.0); - m_bar_.Resize(num_classes, dim); - v_bar_.Resize(num_classes, kUndefined); + if (m_bar_.NumRows() == 0) { + // This is the first time this function was called. + m_bar_.Resize(num_classes, dim); + v_bar_.Resize(num_classes); + } const VectorBase &t_bar(var_derivs); const MatrixBase &mu_bar(mean_derivs); @@ -284,14 +328,12 @@ void GaussianEstimator::SetOutputDerivs( for (int32 i = 0; i < num_classes; i++) { SubVector m_bar_i(m_bar_, i); BaseFloat gamma_i = gamma_(i); - if (gamma_i == 0.0 || s_(i) == variance_floor) { - v_bar_(i) = 0.0; - } else { - BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; - v_bar_(i) = s_bar_i / gamma_i; - m_bar_i.AddVec(-2.0 * s_bar_i / gamma_i, mu_.Row(i)); - } if (gamma_i != 0.0) { + if (s_(i) != variance_floor) { + BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; + v_bar_(i) += s_bar_i / gamma_i; + m_bar_i.AddVec(-2.0 * s_bar_i / gamma_i, mu_.Row(i)); + } m_bar_i.AddVec(1.0 / gamma_i, mu_bar.Row(i)); } } @@ -304,7 +346,7 @@ int32 GaussianEstimator::Dim() const { void GaussianEstimator::AccStatsBackward( const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, const MatrixBase *feats_deriv) { // The equation we're implementing is: // \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) @@ -313,11 +355,10 @@ void GaussianEstimator::AccStatsBackward( int32 T = feats.NumRows(); KALDI_ASSERT(static_cast(post.size() == T) && SameDim(feats, *feats_deriv)); - auto iter = post.begin(); - for (int32 t = 0; t < T; t++,iter++) { + for (int32 t = 0; t < T; t++) { SubVector feat(feats, t), feat_deriv(*feats_deriv, t); - const std::vector > this_post = *iter; + const std::vector > &this_post = post[t]; auto iter2 = this_post.begin(), end2 = this_post.end(); for (; iter2 != end2; ++iter2) { @@ -330,6 +371,23 @@ void GaussianEstimator::AccStatsBackward( } } +void GaussianEstimator::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + mu_.Write(os, binary); + WriteToken(os, binary, ""); + t_.Write(os, binary); + WriteToken(os, binary, ""); +} + +void GaussianEstimator::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + mu_.Read(is, binary); + ExpectToken(is, binary, ""); + t_.Read(is, binary); + ExpectToken(is, binary, ""); +} + FmllrEstimator::FmllrEstimator(const FmllrEstimatorOptions &opts, const MatrixBase &mu, @@ -339,12 +397,12 @@ FmllrEstimator::FmllrEstimator(const FmllrEstimatorOptions &opts, opts_.Check(); gamma_.Resize(num_classes); - G_.Resize(dim, dim); + raw_G_.Resize(dim, dim); z_.Resize(num_classes, dim); } void FmllrEstimator::AccStats(const MatrixBase &feats, - const Posterior &post) { + const SubPosterior &post) { KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(), @@ -382,19 +440,12 @@ void FmllrEstimator::AccStats(const MatrixBase &feats, SubVector gamma_hat_t_part(gamma_hat_t, offset, n_frames); // the 0.0 value for beta means we don't double-count stats. 
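// I.e. the call below sets G = feats_part^T * diag(gamma_hat_t_part) * feats_part
// for this block of frames, which is then added to the accumulated stats.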
G.AddMat2Vec(1.0, feats_part, kTrans, gamma_hat_t_part, 0.0); - G_.AddSp(1.0, G); + raw_G_.AddSp(1.0, G); } } BaseFloat FmllrEstimator::Estimate() { - // If at some point you need to create a version of Estimate() that can be - // called multiple times (e.g. for online applications), it will likely be - // easiest to create a 'const' version of Estimate() that outputs A and b via - // pointers. This one modifies the G_ and K_ quantities, which is what makes - // it tricky to do correctly if called twice. - if (A_.NumRows() != 0) - KALDI_ERR << "You cannot call Estimate() twice."; int32 dim = mu_.NumCols(); BaseFloat gamma_tot = gamma_.Sum(); KALDI_ASSERT(gamma_tot > 0.0 && @@ -426,8 +477,9 @@ BaseFloat FmllrEstimator::Estimate() { K_.AddVecVec(-gamma_hat_tot_, m_, n_); } - // In AccStats(), we did G := \sum_t \hat{\gamma}_t x_t x_t^T. - // Now we do: G -= \hat{\gamma} n n^T + // In AccStats(), we did raw_G := \sum_t \hat{\gamma}_t x_t x_t^T. + // Now we do: G = raw_G - \hat{\gamma} n n^T + G_ = raw_G_; G_.AddVecVec(-gamma_hat_tot_, n_, n_); KALDI_ASSERT(G_.IsSymmetric(0.0001)); @@ -471,7 +523,9 @@ BaseFloat FmllrEstimator::Estimate() { return A_impr + b_impr; } - +bool FmllrEstimator::IsEstimated() const { + return A_.NumRows() != 0; +} void FmllrEstimator::AdaptFeatures(const MatrixBase &feats, MatrixBase *adapted_feats) const { @@ -615,7 +669,7 @@ void FmllrEstimator::EstimateBackward() { void FmllrEstimator::AccStatsBackward( const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *feats_deriv) { KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); int32 T = feats.NumRows(), num_classes = mu_.NumRows(); @@ -658,7 +712,7 @@ void FmllrEstimator::AccStatsBackward( BaseFloat FmllrEstimator::ForwardCombined( const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *adapted_feats) { AccStats(feats, post); BaseFloat ans = Estimate(); @@ -668,7 +722,7 @@ BaseFloat FmllrEstimator::ForwardCombined( void FmllrEstimator::BackwardCombined( const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, const MatrixBase &adapted_feats_deriv, MatrixBase *feats_deriv) { AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); @@ -690,7 +744,7 @@ MeanOnlyTransformEstimator::MeanOnlyTransformEstimator( } void MeanOnlyTransformEstimator::AccStats(const MatrixBase &feats, - const Posterior &post) { + const SubPosterior &post) { int32 T = feats.NumRows(), num_classes = mu_.NumRows(); KALDI_ASSERT(static_cast(post.size()) == T); @@ -728,6 +782,10 @@ void MeanOnlyTransformEstimator::Estimate() { output_deriv_sum_.Resize(dim); } +bool MeanOnlyTransformEstimator::IsEstimated() const { + return offset_.Dim() != 0; +} + void MeanOnlyTransformEstimator::AdaptFeatures( const MatrixBase &feats, MatrixBase *adapted_feats) const { @@ -762,7 +820,7 @@ void MeanOnlyTransformEstimator::EstimateBackward() { void MeanOnlyTransformEstimator::AccStatsBackward( const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *feats_deriv) { int32 T = feats.NumRows(); @@ -781,7 +839,7 @@ void MeanOnlyTransformEstimator::AccStatsBackward( void MeanOnlyTransformEstimator::ForwardCombined( const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *adapted_feats) { AccStats(feats, post); Estimate(); @@ -790,7 +848,7 @@ void MeanOnlyTransformEstimator::ForwardCombined( void MeanOnlyTransformEstimator::BackwardCombined( const MatrixBase &feats, - const Posterior &post, + const 
SubPosterior &post, const MatrixBase &adapted_feats_deriv, MatrixBase *feats_deriv) { AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h index 61bdbf81f92..a1a7b22d451 100644 --- a/src/adapt/differentiable-fmllr.h +++ b/src/adapt/differentiable-fmllr.h @@ -18,8 +18,8 @@ // limitations under the License. -#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ -#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ #include @@ -30,8 +30,6 @@ #include "matrix/matrix-functions.h" namespace kaldi { - - namespace differentiable_transform { @@ -47,6 +45,7 @@ namespace differentiable_transform { + /** With reference to the notation in http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, @@ -86,11 +85,11 @@ struct FmllrEstimatorOptions { // when the amount of data is small. BaseFloat smoothing_count; - // A factor that says how large the assumed between-class covariance matrix is - // relative to the within-class covariance matrix. Should be >= 0. A smaller - // value will mean that the smoothing penalizes rotations of the space less; - // with zero, the smoothing only constrains the singular values of A, not - // its direction. + // A factor that says how large the assumed between-class covariance matrix + // is, relative to the within-class covariance matrix. Should be >= 0. In + // the limit as it approaches zero, the smoothing will only penalize scaling + // of the space, but not rotations. This is likely not a good thing, so a + // value greater than zero will probably be desired. BaseFloat smoothing_between_class_factor; FmllrEstimatorOptions(): @@ -108,6 +107,13 @@ struct FmllrEstimatorOptions { variance_sharing_weight >= 0.0 && variance_sharing_weight <= 1.0); } + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + // This will set any options in this class that it can find in 'config_line'. + void ReadFromConfig(ConfigLine *config_line); + }; @@ -283,11 +289,11 @@ class GaussianEstimator { // // @param [in] feats The input features, of dimension // num-frames by feature-dimension - // @param [in] post The posteriors, which is a + // @param [in] post The posteriors, which can be thought of as a // vector > >. // Its size() must equal feats.NumRows(). void AccStats(const MatrixBase &feats, - const Posterior &post); + const SubPosterior &post); // You call this once after calling AccStats() one or more times. // It estimates the model means and variances. @@ -297,7 +303,7 @@ class GaussianEstimator { // Returns true if Estimate() has previously been called, i.e. if // the means and variances have been computed. - bool IsEstimated(); + bool IsEstimated() const; // Returns the means, in a matrix of dimension num_classes by dim. Must not // be called if ! IsEstimated(). @@ -309,20 +315,19 @@ class GaussianEstimator { // this vector. const VectorBase &GetVars() const { return t_; } - // You call this to set the derivatives df/dmeans and df/dvars-- - // the derivatives of the objective function f w.r.t. those quantities. - // Doing this allows you to backprop through the estimation of the - // means and variances, back to the features. - // This must only be called after previously calling Estimate(). - // This function writes to v_bar_ and m_bar_. 
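+ // As an illustration only (see differentiable-fmllr-test.cc for actual usage,
+ // and note that 'feats', 'post', 'opts' and 'feats_deriv' here are just
+ // placeholder names), the intended forward/backward sequence through the
+ // Gaussian estimation is roughly:
+ // GaussianEstimator g(num_classes, dim);
+ // g.AccStats(feats, post);
+ // g.Estimate(opts);
+ // // ... use g.GetMeans() and g.GetVars(), and obtain derivatives
+ // // 'mean_derivs' and 'var_derivs' of the objective w.r.t. them ...
+ // g.AddToOutputDerivs(mean_derivs, var_derivs);
+ // g.AccStatsBackward(feats, post, &feats_deriv);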
- void SetOutputDerivs(const MatrixBase &mean_derivs, - const VectorBase &var_derivs); + // You call this to add something the derivatives df/dmeans and df/dvars-- the + // derivatives of the objective function f w.r.t. those quantities. You might + // call this once or several times. Doing this allows you to backprop through + // the estimation of the means and variances, back to the features. This must + // only be called after previously calling Estimate(). This function writes + // to v_bar_ and m_bar_. + void AddToOutputDerivs(const MatrixBase &mean_derivs, + const VectorBase &var_derivs); - // This function, which must only be called after SetOutputDerivs() has - // been called, propagates the derivative back to the features. For - // purposes of this backpropagation, the posteriors are treated as - // constants. + // This function, which must only be called after AddToOutputDerivs() has been + // called at least once, propagates the derivative back to the features. For + // purposes of this backpropagation, the posteriors are treated as constants. // @param [in] feats The features, which must be the same // as you provided to one of the calls to // AccStats(). dimension is num-frames by @@ -335,8 +340,15 @@ class GaussianEstimator { // so it must have a well-defined value on // entry. void AccStatsBackward(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, const MatrixBase *feats_deriv); + + + // Note: the Write() and Read() functions are only designed to write the means + // mu_ and the smoothed variances t_. We'll later modify them to (maybe + // conditionally) write other things if needed. + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); private: /* Notes on implementation of GaussianEstimator. @@ -504,16 +516,22 @@ class FmllrEstimator { i is the class label and p is the soft-count. */ void AccStats(const MatrixBase &feats, - const Posterior &post); + const SubPosterior &post); /** Estimate the fMLLR transform parameters A and b. Returns the objective-function improvement compared with A = I, b = 0, divided by the total count as returned by TotalCount(). + + You are allowed to call this multiple times (e.g. call AccStats(), call + Estimate(), call AccStats(), call Estimate() again). */ BaseFloat Estimate(); + // Return true if Estimate() has previously been called. + bool IsEstimated() const; + /// Returns the total count of the posteriors accumulated so far. BaseFloat TotalCount() { return gamma_.Sum(); } @@ -588,7 +606,7 @@ class FmllrEstimator { previously been added by AdaptFeaturesBackward(). */ void AccStatsBackward(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *feats_deriv); /** @@ -602,7 +620,7 @@ class FmllrEstimator { NaNs at entry. */ BaseFloat ForwardCombined(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *adapted_feats); /** Combines AdaptFeaturesBackward(), EstimateBackward(), and @@ -610,7 +628,7 @@ class FmllrEstimator { Note: 'feats_deriv' is *added* to so must be defined at entry. */ void BackwardCombined(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, const MatrixBase &adapted_feats_deriv, MatrixBase *feats_deriv); @@ -635,10 +653,9 @@ class FmllrEstimator { // \gamma_i = \sum_t gamma_{t,i} Vector gamma_; - // This contains - // G = (\sum_t \hat{\gamma}_t x_t x_t^T ) - \hat{\gamma} n n^T. - // Before Estimate() is called, it won't contain the 2nd term, only the first. 
- Matrix G_; + // This contains one term in G_, namely: + // (\sum_t \hat{\gamma}_t x_t x_t^T ) + Matrix raw_G_; // This is of dimension num_classes by dim (same as mu_). It contains // the weighted sums of the input data, for each class: @@ -648,6 +665,14 @@ class FmllrEstimator { /////////// Quantities that are computed when Estimate() is called //////// + + // This contains + // G = (\sum_t \hat{\gamma}_t x_t x_t^T ) - \hat{\gamma} n n^T. + // It is computed as raw_G_ - \hat{\gamma} n n^T. + // We use two separate variables to make it easier to call Estimate() + // more than once without things getting confused. + Matrix G_; + // gamma_hat_tot_ is the total of gamma_(i) / s_(i), i.e. // \hat{\gamma} = \sum_i gamma_i / s_i. BaseFloat gamma_hat_tot_; @@ -814,14 +839,18 @@ class MeanOnlyTransformEstimator { i is the class label and p is the soft-count. */ void AccStats(const MatrixBase &feats, - const Posterior &post); + const SubPosterior &post); /** - Estimate the parameter (the offset b). Requires the total count to be - nonzero. + Estimate the parameter (the offset). Requires the total count to be + nonzero. You are allowed to call this multiple times (e.g. call + AccStats(), call Estimate(), call AccStats(), call Estimate() again). */ void Estimate(); + // Returns true if Estimate() has previously been called. + bool IsEstimated() const; + BaseFloat TotalCount() { return gamma_.Sum(); } /// Return the bias term b. @@ -879,7 +908,7 @@ class MeanOnlyTransformEstimator { the derivative w.r.t. the offset b. */ void AccStatsBackward(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *feats_deriv); @@ -893,7 +922,7 @@ class MeanOnlyTransformEstimator { NaNs at entry. */ void ForwardCombined(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, MatrixBase *adapted_feats); /** Combines AdaptFeaturesBackward(), EstimateBackward(), and @@ -901,7 +930,7 @@ class MeanOnlyTransformEstimator { Note: 'feats_deriv' is *added* to so must be defined at entry. */ void BackwardCombined(const MatrixBase &feats, - const Posterior &post, + const SubPosterior &post, const MatrixBase &adapted_feats_deriv, MatrixBase *feats_deriv); @@ -934,4 +963,4 @@ class MeanOnlyTransformEstimator { } // namespace differentiable_transform } // namespace kaldi -#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc new file mode 100644 index 00000000000..7c467cb8394 --- /dev/null +++ b/src/adapt/differentiable-transform-itf.cc @@ -0,0 +1,101 @@ +// adapt/differentiable-transform-itf.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "adapt/differentiable-transform-itf.h" +#include "adapt/generic-transform.h" + +namespace kaldi { +namespace differentiable_transform { + + +// static +DifferentiableTransform* DifferentiableTransform::ReadNew( + std::istream &is, bool binary) { + + std::string token; + ReadToken(is, binary, &token); // e.g. "" + token.erase(0, 1); // erase "<". + token.erase(token.length()-1); // erase ">". + DifferentiableTransform *ans = NewTransformOfType(token); + if (!ans) + KALDI_ERR << "Unknown DifferentialbeTransform type " << token + << " (maybe you should recompile?)"; + ans->Read(is, binary); + return ans; +} + +// static +DifferentiableTransform* DifferentiableTransform::NewTransformOfType( + const std::string &type) { + if (type == "NoOpTransform") { + return new NoOpTransform(); + } else { + // Calling code will throw an error. + return NULL; + } +} + + +void DifferentiableTransform::TestingForwardBatch( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) { + int32 dim = input.NumCols(), + num_frames = input.NumRows(), + chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // Just copy to CPU for now. + Matrix input_cpu(input); + Matrix output_cpu(num_frames, dim, kUndefined); + + for (int32 s = 0; s < num_spk; s++) { + SpeakerStatsItf *stats = this->GetEmptySpeakerStats(); + for (int32 chunk = s * chunks_per_spk; + chunk < (s + 1) * chunks_per_spk; chunk++) { + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, dim, + input_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + this->TestingAccumulate(this_input, this_posteriors, stats); + } + stats->Estimate(); + for (int32 chunk = s * chunks_per_spk; + chunk < (s + 1) * chunks_per_spk; chunk++) { + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, + output_cpu.Stride() * num_chunks); + this->TestingForward(this_input, *stats, &this_output); + } + delete stats; + } + output->CopyFromMat(output_cpu); +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h new file mode 100644 index 00000000000..b3595434458 --- /dev/null +++ b/src/adapt/differentiable-transform-itf.h @@ -0,0 +1,367 @@ +// adapt/differentiable-transform-itf.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
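+
+// Note on data layout (an illustrative example, matching the strided access in
+// TestingForwardBatch() in differentiable-transform-itf.cc): the rows of the
+// feature matrices are ordered with one block per time frame, so with
+// num_chunks sequences of frames_per_chunk frames each, frame t of chunk c is
+// row t * num_chunks + c. E.g. with num_chunks = 3 and frames_per_chunk = 2,
+// the row order is (chunk0,t0), (chunk1,t0), (chunk2,t0), (chunk0,t1),
+// (chunk1,t1), (chunk2,t1), i.e. chunk c occupies rows c, c + num_chunks,
+// c + 2 * num_chunks, and so on.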
+ + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_ITF_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_ITF_H_ + +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "util/text-utils.h" +#include "hmm/posterior.h" + + +namespace kaldi { +namespace differentiable_transform { + +class MinibatchInfoItf { + public: + virtual ~MinibatchInfoItf() { } +}; + + +class SpeakerStatsItf { + public: + // Does any estimation that is required-- you call this after accumulating + // stats and before calling TestingForward(). + virtual void Estimate(); + + virtual ~SpeakerStatsItf() { } +}; + + +/** + This class is for speaker-dependent feature-space transformations -- + principally various varieties of fMLLR, including mean-only, diagonal and + block-diagonal versions -- which are intended for placement in the bottleneck + of a neural net. So code-wise, we'd have: bottom neural net, then transform, + then top neural net. The transform is designed to be differentiable, i.e. it + can be used during training to propagate derivatives from the top neural net + down to the bottom neural net. The reason this is non-trivial (i.e. why it's + not just a matrix multiplication) is that the value of the transform itself + depends on the features, and also on the speaker-independent statistics for + each class (i.e. the mean and variance), which also depend on the features + sicne we estimate them from the same minibatch. + You can view this as an extension of things like BatchNorm, except the + interface is more complicated because there is a dependence on the per-frame + class labels. + + The class labels we'll use here will probably be derived from some kind of + minimal tree, with hundreds instead of thousands of states. Part of the + reason for using a smaller number of states is that, to make the thing + properly differentiable during training, we need to use a small enough number + of states that we can obtain a reasonable estimate for the mean and (spherical) + variance of a Gaussian for each one in training time. Anyway, as you can see in + http://isl.anthropomatik.kit.edu/pdf/Nguyen2017.pdf, it's generally better + for this kind of thing to use "simple target models" for adaptation rather than + very complex models. + + Note: for training utterances we'll generally get the class labels used for + adatpation in a supervised manner, either by aligning a previous system like + a GMM system, or-- more likely-- from the (soft) posteriors of the the + numerator graphs. In test time, we'll usually be getting these class labels + from some kind of unsupervised process. + + Because we tend to train neural nets on fairly small fixed-size chunks + (e.g. 1.5 seconds), and transforms like fMLLR don't tend to work very well + until you have about 5 seconds of data, we will usually be arranging those + chunks into groups where all members of the group come from the same + speaker. So, for instance, instead of 128 totally separate chunks, we might + have 4 chunks per speaker and 32 speakers. + + The basic pattern of usage of class DifferentiableTransform is this: + + - Initialize the object prior to training, e.g. with InitFromConfig(). + + - Use this object to jointly train the 'bottom' (feature-extracting) and + 'top' (ASR) network. This involves functions TrainingForward() and + TrainingBackward() of this object; the posteriors used for that might be + dumped with the 'egs' (e.g. 
come from a GMM system), or might be derived + from the alignment of the numerator lattices in chain training. Any + class means that must be estimated, would be estimated on each minibatch + (we'll try to keep the minibatches as large as possible, and may use + tricks like using bigger minibatch sizes for the bottom + (feature-extracting) network and smaller ones for the top one, to save + memory. At this stage, this object will most likely only contain + configuration information and not any kind of data-dependent statistics. + + - Use some reasonable-sized subset of training data to accumulate more + reliable statistics for the target model using Accumulate() followed + by Estimate(). If NumFinalIterations() is more than one you may need + do this in a short loop. + + - In test time, for each speaker you'll: + - call GetEmptySpeakerStats() to get an object to store adaptation statistics + for your speaker. + - Obtain some class-level posteriors somehow (could come from an initial + decoding pass on all the data, or from the final decoding pass on the + part of the data you've seen up till now). Use these to call + TestingAccumulate() to accumulate speaker stats. + - Call TestingForward() with the speaker-stats object to get + adapted features. + + + */ +class DifferentiableTransform { + public: + + /// Return the dimension of the features this operates on. + virtual int32 Dim() const = 0; + + /// Return the number of classes in the model used for adaptation. These + /// will probably correspond to the leaves of a small tree, so they would + /// be pdf-ids. This model only keeps track of the number of classes, + /// it does not contain any information about what they mean. The + /// integers in the objects of type Posterior provided to this class + /// are expected to contain numbers from 0 to NumClasses() - 1. + int32 NumClasses() const { return num_classes_; } + + + /// This can be used to change the number of classes. It would normally be + /// used, if at all, after the model is trained and prior to calling + /// Accumulate(), in case you want to use a more detailed model (e.g. the + /// normal-size tree instead of the small one that we use during training). + /// Child classes may want to override this, in case they need to do + /// something more than just set this variable. + virtual void SetNumClasses(int32 num_classes) { num_classes_ = num_classes; } + + /** + This is the function you call in training time, for the forward + pass; it adapts the features. By "training time" here, we + assume you are training the 'bottom' neural net, that produces + the features in 'input'; if you were not training it, it would + be the same as test time as far as this function is concerned. + + @param [in] input The original, un-adapted features; these + will typically be output by a neural net, the 'bottom' net in our + terminology. This will correspond to a whole minibatch, + consisting of multiple speakers and multiple sequences (chunks) + per speaker. Caution: the order of both the input and + output features, and the posteriors, does not consist of blocks, + one per sequence, but rather blocks, one per time frame, so the + sequences are intercalated. This is the default order in + nnet3; see operator < of nnet3::Index. + @param [in] num_chunks The number of individual sequences + (e.g., chunks of speech) represented in 'input'. + input.NumRows() will equal num_sequences times the number + of time frames. + @param [in] num_spk The number of speakers. 
Must be greater than one, and + must divide num_chunks. The number of chunks per speaker + (num_chunks / num_spk) must be the same for all speakers, and the + chunks for a speaker must be consecutive. + @param [in] posteriors (note: this is a vector of vector of + pair). This provides, in 'soft-count' + form, the class supervision information that is used for the + adaptation. posteriors.size() will be equal to input.NumRows(), + and the ordering of its elements is the same as the ordering + of the rows of input, i.e. the sequences are intercalated. + There is no assumption that the posteriors sum to one; + this allows you to do things like silence weighting. + @param [out] output The adapted output. This matrix should have the + same dimensions as 'input'. It does not have to be free of + NaNs when you call this function. + @return This function returns either NULL or an object of type + DifferentiableTransformItf*, which is expected to later be given + to the function TrainingBackward(). It will store + any information that needs to be remembered for the backward + phase. + */ + virtual MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const = 0; + + /** + This does the backpropagation, during the training pass. + + @param [in] input The original input (pre-transform) features that + were given to TrainingForward(). + @param [in] output_deriv The derivative of the objective function + (that we are backpropagating) w.r.t. the output. + @param [in] num_chunks,num_spk,posteriors + See TrainingForward() for information + about these arguments; they should be the same + values. + @param [in] minibatch_info The pointer returned by the corresponding + call to TrainingForward() (may be NULL). This function + takes possession of the pointer. If for some reason the + backward pass was not done, the caller will likely + want to delete it themselves. + @param [in,out] input_deriv The derivative at the input, i.e. + dF/d(input), where F is the function we are + evaluating. Must have the same dimension as + 'input'. The derivative is *added* to here. + This is useful because generally we will also + be training (perhaps with less weight) on + the unadapted features, in order to prevent them + from deviating too far from the adapted ones + and to allow the same model to be used for the + first pass. + */ + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const = 0; + + + /** + Returns the number of times you have to (call Accumulate() on a subset + of data, then call Estimate()) + */ + virtual int32 NumFinalIterations() = 0; + + /** + This will typically be called sequentially, minibatch by minibatch, + for a subset of training data, after training the neural nets, + followed by a call to Estimate(). Accumulate() stores statistics + that are used by Estimate(). This process is analogous to + computing the final stats in BatchNorm, in preparation for testing. + In practice it will be doing things like computing per-class means + and variances. + + @param [in] final_iter An iteration number in the range + [0, NumFinalIterations()]. In many cases there will + be only one iteration so this will just be zero. 
+ + The input parameters are the same as the same-named parameters to + TrainingForward(); please refer to the documentation there. + */ + virtual void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) = 0; + + // To be called after repeated calls to Accumulate(), does any estimation that + // is required in training time (normally per-speaker means and possibly + // variances. + // @param [in] final_iter An iteration number in the range + // [0, NumFinalIterations()]. In many cases there will + // be only one iteration so this will just be zero. + virtual void Estimate(int32 final_iter) = 0; + + // Returns an object representing sufficient statistics for estimating a + // speaker-dependent transform. This object will initially have zero + // counts in its statistics. It will represent the stats for a single + // speaker. + virtual SpeakerStatsItf *GetEmptySpeakerStats() const = 0; + + + // Accumulate statistics for a segment of test data, storing them in the + // object 'speaker_stats'. There is no assumption that the soft-counts in + // 'posteriors' are positive; this allows you to change your mind about the + // traceback, in test-time, by subtracting the stats that you no longer want + // to use. + virtual void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const = 0; + + // Applies the transformation implied by the statistics in 'speaker_stats' to + // 'input', storing in the result in 'output'. You must have done any estimation + // procedure that is required first, by calling Estimate() on the speaker-stats + // object. 'output' may contain NaN's at entry. + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const = 0; + + // TestingForwardBatch() combines GetEmptySpeakerStats(), TestingAccumulate() and + // TestingForward(). It has a default implementation. It is a convenience + // function that may be useful during training under some circumstances, e.g. + // when you want to train only the top network. + virtual void TestingForwardBatch( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output); + + // Copies transform (deep copy). + virtual DifferentiableTransform* Copy() const = 0; + + // Return the type of this transform. E.g. "NoOpTransform". + virtual std::string Type() const = 0; + + /* + Initialize this object from the config line at position 'cur_pos' of the + vector 'config_lines'. This function may end up reading more lines than + one, if this is a transform type that contains other transforms. + + @param [in] cur_pos The starting position in config_lines; required + to be in the range [0, config_lines->size() - 1]. + The Type() of this object must match the first token + (function FirstToken()) of that ConfigLine. + @param [in,out] config_lines Config lines to be read. It's non-const + because the process of reading them has effects on + the lines themselves (the ConfigLine object keeps + track of which configuration values have been read). + @return Returns the next position to be read. Will be in the range + [cur_pos + 1, config_lines->size()]; if it's equal to + config_lines->size(), it means we're done. + */ + virtual int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines); + + // Returns a new transform of the given type e.g. 
"NoOpTransform" + // or NULL if no such component type exists. + static DifferentiableTransform *NewTransformOfType(const std::string &type); + + // Reads a differentiable transform from a config file (this function parses + // the file and reads a single DifferentiableTransform object from it). Note: + // since DifferentiableTransform objects can contain others, the file may + // contain many lines. + static DifferentiableTransform *ReadFromConfig(std::istream &is); + + // Write transform to stream + virtual void Write(std::ostream &os, bool binary) const = 0; + + // Reads transform from stream (normally you would previously have created + // the transform object of the correct type using ReadNew(). + virtual void Read(std::istream &is, bool binary) = 0; + + // Read transform from stream (works out its type). Dies on error. + // This will be used when reading in objects that have been written with + // the Write() function, since you won't know the type of the object + // beforehand. + static DifferentiableTransform* ReadNew(std::istream &is, bool binary); + + virtual ~DifferentiableTransform() { } + protected: + int32 num_classes_; +}; + +// Attempts to read a transform +DifferentiableTransform *ReadTransformAtPosition( + int32 pos, std::vector *config_lines); + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc new file mode 100644 index 00000000000..419d754d764 --- /dev/null +++ b/src/adapt/differentiable-transform-test.cc @@ -0,0 +1,660 @@ +// adapt/differentiable-transform-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + +// This function writes a random configuration file of dimension +// 'dim' (or a random dimension if dim == -1) to 'os'. +void WriteRandomConfigOfDim(std::ostream &os, int32 dim) { + // nonrandom_dim is a randomly chosen dimension if dim == -1, + // else it's dim. + int32 actual_dim = (dim == -1 ? RandInt(10, 20) : dim); + int32 i, num_transforms = RandInt(1, 3); + + while (true) { + // we loop here in case we hit a case we don't want to handle. + // We give more cases to the non-recursive transforms to ensure + // the expected size of the config file is finite. + switch(RandInt(0, 7)) { + case 0: case 1: + os << "NoOpTransform dim=" << actual_dim << "\n"; + return; + case 2: case 3: + os << "FmllrTransform dim=" << actual_dim << "\n"; + return; + case 4: case 5: + os << "MeanOnlyTransform dim=" << actual_dim << "\n"; + return; + case 6: + if (dim != -1) // complicated to ensure a given dim for AppendTransform. 
+ continue; + os << "AppendTransform num-transforms=" << num_transforms << "\n"; + for (i = 0; i < num_transforms; i++) + WriteRandomConfigOfDim(os, -1); + return; + case 7: + os << "SequenceTransform num-transforms=" << num_transforms << "\n"; + for (i = 0; i < num_transforms; i++) + WriteRandomConfigOfDim(os, actual_dim); + return; + } + } + +} + +// This function writes a random configuration file to 'os'. +void WriteRandomConfigFile(std::ostream &os) { + WriteRandomConfigOfDim(os, -1); +} + + + + + +// Test derivatives produced by the Estimator object for K. +void TestCoreFmllrEstimatorKDeriv( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-03 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. + + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + for (int32 i = 0; i < num_directions; i++) { + Matrix K_new(dim, dim); + K_new.SetRandn(); + K_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(K_new, K_deriv, kTrans); + K_new.AddMat(1.0, K); + FmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G, K_new, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that.
+ actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + + +void UnitTestCoreFmllrEstimatorSimple() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + G.AddToDiag(1.234 * gamma); + K.AddToDiag(0.234 * gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A; + KALDI_ASSERT(A.IsUnit(0.01)); + KALDI_ASSERT(fabs(objf_impr) < 0.01); + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +static void InitRandNonsingular(MatrixBase *M) { + do { + M->SetRandn(); + } while (M->Cond() > 50.0); +} + + +void UnitTestCoreFmllrEstimatorGeneral() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + + { + // make sure G is symmetric and +ve definite. + Matrix A(dim, dim + 10); + A.SetRandn(); + G.AddMatMat(gamma, A, kNoTrans, A, kTrans, 0.0); + } + + InitRandNonsingular(&K); + K.Scale(gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A << ", objf impr is " << objf_impr; + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +void TestGaussianEstimatorDerivs(const MatrixBase &feats, + const Posterior &post, + const FmllrEstimatorOptions &opts, + GaussianEstimator *g) { + int32 n = 4; // number of delta-params we use. + Vector expected_changes(n), + actual_changes(n); + + // if !test_mean_deriv, then we test the var deriv. + bool test_mean_deriv = (RandInt(0, 1) == 0); + + int32 num_classes = g->NumClasses(), dim = g->Dim(); + + Matrix mean_derivs(num_classes, dim); + Vector var_derivs(num_classes); + if (test_mean_deriv) { + KALDI_LOG << "Testing mean derivs."; + mean_derivs.SetRandn(); + } else { + KALDI_LOG << "Testing var derivs."; + var_derivs.SetRandn(); + var_derivs.Add(0.2); // Nonzero mean makes the test easier to pass + } + g->AddToOutputDerivs(mean_derivs, var_derivs); + Matrix feats_deriv(feats.NumRows(), feats.NumCols()); + g->AccStatsBackward(feats, post, &feats_deriv); + + BaseFloat epsilon = 1.0e-04; + + for (int32 i = 0; i < n; i++) { + Matrix new_feats(feats.NumRows(), + feats.NumCols()); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + + expected_changes(i) = TraceMatMat(feats_deriv, new_feats, kTrans); + + new_feats.AddMat(1.0, feats); + + GaussianEstimator g2(num_classes, dim); + g2.AccStats(new_feats, post); + g2.Estimate(opts); + + actual_changes(i) = + TraceMatMat(mean_derivs, g2.GetMeans(), kTrans) - + TraceMatMat(mean_derivs, g->GetMeans(), kTrans) + + VecVec(var_derivs, g2.GetVars()) - + VecVec(var_derivs, g->GetVars()); + } + KALDI_LOG << "Actual changes are " << actual_changes + << " vs. predicted " << expected_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + +void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = f.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-04; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + FmllrEstimator f2(opts, new_mu, s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + +void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + // Adding a systematic component to the derivative makes the test easier + // to pass, as the derivs are less random. 
+ adapted_feats_deriv.AddMat(0.1, feats); + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const VectorBase &s_deriv = f.GetVarDeriv(); + + // measure the accuracy of the deriv in 10 random directions + int32 n = 10; + BaseFloat epsilon = 0.01; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Vector new_s(num_classes, kUndefined); + Matrix new_adapted_feats(T, dim, kUndefined); + new_s.SetRandn(); + new_s.Scale(epsilon); + expected_changes(i) = VecVec(new_s, s_deriv); + new_s.AddVec(1.0, s); + FmllrEstimator f2(opts, mu, new_s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + + +void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Add(RandGauss()); // will help to test whether the indirect + // part of the derivative is accurate. + new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + FmllrEstimator f2(opts, mu, s); + f2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorMeanDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = m.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + MeanOnlyTransformEstimator m2(new_mu); + m2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorFeatDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. 
+ int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + MeanOnlyTransformEstimator m2(mu); + m2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void UnitTestGaussianAndEstimators() { + // It's important that the number of classes be greater than the dimension, or + // we would get a low-rank K. + int32 num_classes = RandInt(30, 40), + dim = RandInt(10, 20), + num_frames = RandInt(20 * num_classes, 40 * num_classes); + + GaussianEstimator g(num_classes, dim); + + Matrix feats(num_frames, dim); + feats.SetRandn(); + feats.Add(0.2); // Nonzero offset tests certain aspects of the code better. + Posterior post(num_frames); + for (int32 t = 0; t < num_frames; t++) { + int32 n = RandInt(0, 2); + for (int32 j = 0; j < n; j++) { + int32 i = RandInt(0, num_classes - 1); + BaseFloat p = 0.25 * RandInt(1, 5); + post[t].push_back(std::pair(i, p)); + } + } + g.AccStats(feats, post); + FmllrEstimatorOptions opts; + // avoid setting variance_sharing_weight to 1.0; it's hard for the tests to + // succeed then, and there are valid reasons for that + opts.variance_sharing_weight = 0.25 * RandInt(0, 2); + g.Estimate(opts); + KALDI_LOG << "Means are: " + << g.GetMeans() << ", vars are: " + << g.GetVars(); + + TestGaussianEstimatorDerivs(feats, post, opts, &g); + + if (RandInt(0, 1) == 0) { + opts.smoothing_count = 500.0; + } + + { // test FmllrEstimator + TestFmllrEstimatorMeanDerivs(feats, post, g); + TestFmllrEstimatorFeatDerivs(feats, post, g); + TestFmllrEstimatorVarDerivs(feats, post, g); + } + + { // test MeanOnlyTransformEstimator. + TestMeanOnlyTransformEstimatorMeanDerivs(feats, post, g); + TestMeanOnlyTransformEstimatorFeatDerivs(feats, post, g); + } + + + + +} + + + +} // namespace kaldi +} // namespace differentiable_transform + + + +int main() { + using namespace kaldi::differentiable_transform; + + for (int32 i = 0; i < 50; i++) { + UnitTestCoreFmllrEstimatorSimple(); + UnitTestCoreFmllrEstimatorGeneral(); + UnitTestGaussianAndEstimators(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc new file mode 100644 index 00000000000..e622effc9ea --- /dev/null +++ b/src/adapt/differentiable-transform.cc @@ -0,0 +1,530 @@ +// adapt/differentiable-transform.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform.h" + + +// This header contains the 'base-cases' of DifferentiableTransform: namely, +// FmllrTransform and MeanOnlyTransform. See also generic-transform.h where +// sequence, append and no-op types are defined. +namespace kaldi { +namespace differentiable_transform { + +FmllrMinibatchInfo::FmllrMinibatchInfo( + int32 num_classes, int32 dim, int32 num_speakers): + target_model(num_classes, dim), + estimators(num_speakers, NULL) { } + +FmllrMinibatchInfo::~FmllrMinibatchInfo() { + for (size_t i = 0; i < estimators.size(); i++) + delete estimators[i]; +} + + +int32 FmllrTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == "FmllrTransform"); + + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for FmllrTransform, config " + "line is: " << line->WholeLine(); + fmllr_opts_.ReadFromConfig(line); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + +void FmllrTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + fmllr_opts_.Write(os, binary); + if (target_model_ != NULL) { + WriteToken(os, binary, ""); + target_model_->Write(os, binary); + } else { + WriteToken(os, binary, ""); + } + WriteToken(os, binary, ""); +} + +void FmllrTransform::Read(std::istream &is, bool binary) { + delete target_model_; + target_model_ = NULL; + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + fmllr_opts_.Read(is, binary); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + target_model_ = new GaussianEstimator(num_classes_, dim_); + } // else "". + ExpectToken(is, binary, ""); +} + + +MinibatchInfoItf* FmllrTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 num_classes = num_classes_, + dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, *output) && input.NumCols() == dim && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + FmllrMinibatchInfo *ans = new FmllrMinibatchInfo(num_classes, + dim, num_spk); + + // The input is in CuMatrix, i.e. it's on the GPU if we're using a GPU. 
For + // now we just transfer everything to CPU, which of course is not optimal; we + // may later implement some of the deeper parts of this on GPU if the methods + // turn out to be effective. + Matrix input_cpu(input), + output_cpu(num_frames, dim, kUndefined); + + // First estimate the target model (Gaussian means and spherical variances). + ans->target_model.AccStats(input_cpu, posteriors); + ans->target_model.Estimate(fmllr_opts_); + + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s] = new FmllrEstimator(fmllr_opts_, + ans->target_model.GetMeans(), + ans->target_model.GetVars()); + + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, // num-rows + dim, // num-cols + input_cpu.Stride() * num_chunks); // stride + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + ans->estimators[speaker]->AccStats(this_input, this_posteriors); + } + BaseFloat objf_impr = 0.0; + for (int32 s = 0; s < num_spk; s++) + objf_impr += ans->estimators[s]->Estimate() / num_spk; + // objf_impr is now the average objective-function improvement per frame. + // We will later find a better way to display this. + KALDI_LOG << "Objective function improvement per frame is " + << objf_impr; + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix + this_input(input_cpu.RowData(chunk), frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + ans->estimators[speaker]->AdaptFeatures(this_input, &this_output); + } + output->CopyFromMat(output_cpu); + return ans; +} + +void FmllrTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + FmllrMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Wrong type of minibatch info supplied."); + + int32 dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, output_deriv) && input.NumCols() == dim && + SameDim(input, *input_deriv) && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // For now we just transfer everything to the CPU. 
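+  // As in TrainingForward(), the per-speaker estimators operate on CPU
+  // matrices; only the final derivative w.r.t. the input is copied back to
+  // the GPU at the end of this function.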
+ Matrix input_cpu(input), + output_deriv_cpu(output_deriv), + input_deriv_cpu(num_frames, dim); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + info->estimators[speaker]->AdaptFeaturesBackward( + this_input, this_output_deriv, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->estimators[s]->EstimateBackward(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, chunk, + frames_per_chunk, num_chunks); + info->estimators[speaker]->AccStatsBackward( + this_input, this_posteriors, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->target_model.AddToOutputDerivs(info->estimators[s]->GetMeanDeriv(), + info->estimators[s]->GetVarDeriv()); + + info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu); + input_deriv->CopyFromMat(input_deriv_cpu); + + delete info; +} + + +void FmllrTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + KALDI_ASSERT(final_iter == 0); + if (target_model_ == NULL) + target_model_ = new GaussianEstimator(num_classes_, dim_); + Matrix input_cpu(input); + target_model_->AccStats(input_cpu, posteriors); +} + +SpeakerStatsItf *FmllrTransform::GetEmptySpeakerStats() const { + KALDI_ASSERT(target_model_ != NULL && + target_model_->GetMeans().NumRows() != 0 && + "You're trying to do adaptation with speaker transforms on " + "which you haven't done the final phase of training."); + return new FmllrSpeakerStats(fmllr_opts_, target_model_->GetMeans(), + target_model_->GetVars()); +} + +void FmllrTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + FmllrSpeakerStats *stats = dynamic_cast( + speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + stats->estimator.AccStats(input, posteriors); +} + +void FmllrTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const FmllrSpeakerStats *stats = dynamic_cast( + &speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + KALDI_ASSERT(stats->estimator.IsEstimated() && + "You can't call TestingForward() without calling Estimate() on " + "the speaker stats."); + stats->estimator.AdaptFeatures(input, output); +} + +FmllrTransform::~FmllrTransform() { + delete target_model_; +} + + + +MeanOnlyTransformMinibatchInfo::MeanOnlyTransformMinibatchInfo( + int32 num_classes, int32 dim, int32 num_speakers): + target_model(num_classes, dim), + estimators(num_speakers, NULL) { } + +MeanOnlyTransformMinibatchInfo::~MeanOnlyTransformMinibatchInfo() { + for 
(size_t i = 0; i < estimators.size(); i++) + delete estimators[i]; +} + + +int32 MeanOnlyTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == "MeanOnlyTransform"); + + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for MeanOnlyTransform, config " + "line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + +void MeanOnlyTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + if (target_model_ != NULL) { + WriteToken(os, binary, ""); + target_model_->Write(os, binary); + } else { + WriteToken(os, binary, ""); + } + WriteToken(os, binary, ""); +} + +void MeanOnlyTransform::Read(std::istream &is, bool binary) { + delete target_model_; + target_model_ = NULL; + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + target_model_ = new GaussianEstimator(num_classes_, dim_); + } // else "". + ExpectToken(is, binary, ""); +} + + +MinibatchInfoItf* MeanOnlyTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 num_classes = num_classes_, + dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, *output) && input.NumCols() == dim && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + MeanOnlyTransformMinibatchInfo *ans = new MeanOnlyTransformMinibatchInfo(num_classes, + dim, num_spk); + + // The input is in CuMatrix, i.e. it's on the GPU if we're using a GPU. For + // now we just transfer everything to CPU, which of course is not optimal; we + // may later implement some of the deeper parts of this on GPU if the methods + // turn out to be effective. + Matrix input_cpu(input), + output_cpu(num_frames, dim, kUndefined); + + // First estimate the target model (Gaussian means and spherical variances). + // We use the default options: they only affect the variances, which we won't + // be using. 
+ ans->target_model.AccStats(input_cpu, posteriors); + FmllrEstimatorOptions default_opts; + ans->target_model.Estimate(default_opts); + + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s] = new MeanOnlyTransformEstimator( + ans->target_model.GetMeans()); + + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, // num-rows + dim, // num-cols + input_cpu.Stride() * num_chunks); // stride + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + ans->estimators[speaker]->AccStats(this_input, this_posteriors); + } + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s]->Estimate(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix + this_input(input_cpu.RowData(chunk), frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + ans->estimators[speaker]->AdaptFeatures(this_input, &this_output); + } + output->CopyFromMat(output_cpu); + return ans; +} + +void MeanOnlyTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + MeanOnlyTransformMinibatchInfo *info = + dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Wrong type of minibatch info supplied."); + + int32 dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, output_deriv) && input.NumCols() == dim && + SameDim(input, *input_deriv) && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // For now we just transfer everything to the CPU. + Matrix input_cpu(input), + output_deriv_cpu(output_deriv), + input_deriv_cpu(num_frames, dim); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + info->estimators[speaker]->AdaptFeaturesBackward( + this_input, this_output_deriv, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->estimators[s]->EstimateBackward(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, chunk, + frames_per_chunk, num_chunks); + info->estimators[speaker]->AccStatsBackward( + this_input, this_posteriors, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) { + Vector var_derivs(num_classes_); // zero. 
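+    // A mean-only transform has no variance parameters, so the variance
+    // derivatives passed to the target model are identically zero.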
+ info->target_model.AddToOutputDerivs(info->estimators[s]->GetMeanDeriv(), + var_derivs); + } + + info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu); + input_deriv->CopyFromMat(input_deriv_cpu); + + delete info; +} + + +void MeanOnlyTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + KALDI_ASSERT(final_iter == 0); + if (target_model_ == NULL) + target_model_ = new GaussianEstimator(num_classes_, dim_); + Matrix input_cpu(input); + target_model_->AccStats(input_cpu, posteriors); +} + +SpeakerStatsItf *MeanOnlyTransform::GetEmptySpeakerStats() const { + KALDI_ASSERT(target_model_ != NULL && + target_model_->GetMeans().NumRows() != 0 && + "You're trying to do adaptation with speaker transforms on " + "which you haven't done the final phase of training."); + return new MeanOnlyTransformSpeakerStats(target_model_->GetMeans()); +} + +void MeanOnlyTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + MeanOnlyTransformSpeakerStats *stats = dynamic_cast( + speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + stats->estimator.AccStats(input, posteriors); +} + +void MeanOnlyTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const MeanOnlyTransformSpeakerStats *stats = dynamic_cast( + &speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + KALDI_ASSERT(stats->estimator.IsEstimated() && + "You can't call TestingForward() without calling Estimate() on " + "the speaker stats."); + stats->estimator.AdaptFeatures(input, output); +} + +MeanOnlyTransform::~MeanOnlyTransform() { + delete target_model_; +} + + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h index 98fcaf11086..dce197e9444 100644 --- a/src/adapt/differentiable-transform.h +++ b/src/adapt/differentiable-transform.h @@ -24,270 +24,35 @@ #include #include "base/kaldi-common.h" -#include "util/kaldi-table.h" -#include "util/kaldi-holder.h" - -namespace kaldi { +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "adapt/differentiable-transform-itf.h" +#include "adapt/differentiable-fmllr.h" +// This header contains the 'base-cases' of DifferentiableTransform: namely, +// FmllrTransform and MeanOnlyTransform. See also generic-transform.h where +// sequence, append and no-op types are defined. +namespace kaldi { namespace differentiable_transform { -class MinibatchInfoItf { - public: - - virtual ~MinibatchInfoItf() { } -}; - - -class SpeakerStatsItf { - - virtual ~SpeakerStatsItf() { } -}; - - /** - This class is for speaker-dependent feature-space transformations -- - principally various varieties of fMLLR, including mean-only, diagonal and - block-diagonal versions -- which are intended for placement in the bottleneck - of a neural net. So code-wise, we'd have: bottom neural net, then transform, - then top neural net. The transform is designed to be differentiable, i.e. it - can be used during training to propagate derivatives from the top neural net - down to the bottom neural net. The reason this is non-trivial (i.e. 
why it's - not just a matrix multiplication) is that the value of the transform itself - depends on the features, and also on the speaker-independent statistics for - each class (i.e. the mean and variance), which also depend on the features - sicne we estimate them from the same minibatch. - You can view this as an extension of things like BatchNorm, except the - interface is more complicated because there is a dependence on the per-frame - class labels. - - The class labels we'll use here will probably be derived from some kind of - minimal tree, with hundreds instead of thousands of states. Part of the - reason for using a smaller number of states is that, to make the thing - properly differentiable during training, we need to use a small enough number - of states that we can obtain a reasonable estimate for the mean and (spherical) - variance of a Gaussian for each one in training time. Anyway, as you can see in - http://isl.anthropomatik.kit.edu/pdf/Nguyen2017.pdf, it's generally better - for this kind of thing to use "simple target models" for adaptation rather than - very complex models. - - Note: for training utterances we'll generally get the class labels used for - adatpation in a supervised manner, either by aligning a previous system like - a GMM system, or-- more likely-- from the (soft) posteriors of the the - numerator graphs. In test time, we'll usually be getting these class labels - from some kind of unsupervised process. - - Because we tend to train neural nets on fairly small fixed-size chunks - (e.g. 1.5 seconds), and transforms like fMLLR don't tend to work very well - until you have about 5 seconds of data, we will usually be arranging those - chunks into groups where all members of the group come from the same - speaker. - */ -class DifferentiableTransform { - public: - - /// Return the dimension of the features this operates on. - virtual int32 Dim() const = 0; - - /// Return the number of classes in the model used for adaptation. These - /// will probably correspond to the leaves of a small tree, so they would - /// be pdf-ids. This model only keeps track of the number of classes, - /// it does not contain any information about what they mean. The - /// integers in the objects of type Posterior provided to this class - /// are expected to contain numbers from 0 to NumClasses() - 1. - int32 NumClasses() const { return num_classes_; } - - - /// This can be used to change the number of classes. It would normally be - /// used, if at all, after the model is trained and prior to calling - /// Accumulate(), in case you want to use a more detailed model (e.g. the - /// normal-size tree instead of the small one that we use during training). - /// Child classes may want to override this, in case they need to do - /// something more than just set this variable. - virtual void SetNumClasses(int32 num_classes) { num_classes_ = num_classes; } - - /** - This is the function you call in training time, for the forward - pass; it adapts the features. By "training time" here, we - assume you are training the 'bottom' neural net, that produces - the features in 'input'; if you were not training it, it would - be the same as test time as far as this function is concerned. - - @param [in] input The original, un-adapted features; these - will typically be output by a neural net, the 'bottom' net in our - terminology. This will correspond to a whole minibatch, - consisting of multiple speakers and multiple sequences (chunks) - per speaker. 
Caution: the order of both the input and - output features, and the posteriors, does not consist of blocks, - one per sequence, but rather blocks, one per time frame, so the - sequences are intercalated. This is the default order; - see operator < of nnet3::Index. - @param [in] num_chunks The number of individual sequences - (e.g., chunks of speech) represented in 'input'. - input.NumRows() will equal num_sequences times the number - of time frames. - @param [in] num_spk The number of speakers. Must be greater than one, and - must divide num_chunks. The number of chunks per speaker - (num_chunks / num_spk) must be the same for all speakers, and the - chunks for a speaker must be consecutive. - @param [in] posteriors (note: this is a vector of vector of - pair). This provides, in 'soft-count' - form, the class supervision information that is used for the - adaptation. posteriors.size() will be equal to input.NumRows(), - and the ordering of its elements is the same as the ordering - of the rows of input, i.e. the sequences are intercalated. - There is no assumption that the posteriors sum to one; - this allows you to do things like silence weighting. - @param [out] output The adapted output. This matrix should have the - same dimensions as 'input'. It does not have to be free of - NaNs when you call this function. - @return This function returns either NULL or an object of type - DifferentiableTransformItf*, which is expected to later be given - to the function TrainingBackward(). It will store - any information that needs to be remembered for the backward - phase. - */ - virtual MinibatchInfoItf* TrainingForward( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - CuMatrixBase *output) const = 0; - - - /** - This does the backpropagation, during the training pass. - - @param [in] input The original input (pre-transform) features that - were given to TrainingForward(). - @param [in] output_deriv The derivative of the objective function - (that we are backpropagating) w.r.t. the output. - @param [in] num_chunks,num_spk,posteriors - See TrainingForward() for information - about these arguments; they should be the same - values. - @param [in] minibatch_info The pointer returned by the corresponding - call to TrainingForward() (may be NULL). This function - takes possession of the pointer. If for some reason the - backward pass was not done, the caller will likely - want to delete it themselves. - @param [in,out] input_deriv The derivative at the input, i.e. - dF/d(input), where F is the function we are - evaluating. Must have the same dimension as - 'input'. The derivative is *added* to here. - This is useful because generally we will also - be training (perhaps with less weight) on - the unadapted features, in order to prevent them - from deviating too far from the adapted ones - and to allow the same model to be used for the - first pass. - */ - virtual void TrainingBackward( - const CuMatrixBase &input, - const CuMatrixBase &output_deriv, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - MinibatchInfoItf *minibatch_info, - CuMatrixBase *input_deriv) const = 0; - - - /** - Returns the number of times you have to (call Accumulate() on a subset - of data, then call Estimate()) - */ - virtual int32 NumFinalIterations() = 0; - - /** - This will typically be called sequentially, minibatch by minibatch, - for a subset of training data, after training the neural nets, - followed by a call to Estimate(). 
Accumulate() stores statistics - that are used by Estimate(). This process is analogous to - computing the final stats in BatchNorm, in preparation for testing. - In practice it will be doing things like computing per-class means - and variances. - - @param [in] final_iter An iteration number in the range - [0, NumFinalIterations()]. In many cases there will - be only one iteration so this will just be zero. - - The input parameters are the same as the same-named parameters to - TrainingForward(); please refer to the documentation there. - */ - virtual void Accumulate( - int32 final_iter, - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors) = 0; - - // To be called after repeated calls to Accumulate(), does any estimation that - // is required in training time (normally per-speaker means and possibly - // variances. - // @param [in] final_iter An iteration number in the range - // [0, NumFinalIterations()]. In many cases there will - // be only one iteration so this will just be zero. - virtual void Estimate(int32 final_iter) = 0; - - // Returns an object representing sufficient statistics for estimating a - // speaker-dependent transform. This object will initially have zero - // counts in its statistics. It will represent the stats for a single - // speaker. - virtual SpeakerStatsItf *GetEmptySpeakerStats() = 0; - - - // Accumulate statistics for a segment of test data, storing them in the - // object 'speaker_stats'. There is no assumption that the soft-counts in - // 'posteriors' are positive; this allows you to change your mind about the - // traceback, in test-time, by subtracting the stats that you no longer want - // to use. - virtual void TestingAccumulate( - const MatrixBase &input, - const Posterior &posteriors, - SpeakerStatsItf *speaker_stats) const = 0; - - // Applies the transformation implied by the statistics in 'speaker_stats' to - // 'input', storing in the result in 'output'. It will do any estimation - // procedure that is required first, if applicable. - virtual void TestingForward( - const MatrixBase &input, - const SpeakerStatsItf &speaker_stats, - MatrixBase *output) const = 0; - - - // Read transform from stream (works out its type). Dies on error. - static DifferentiableTransform* ReadNew(std::istream &is, bool binary); - - // Copies transform (deep copy). - virtual DifferentiableTransform* Copy() const = 0; - - // Returns a new transform of the given type e.g. "MeanNormalize", - // or NULL if no such component type exists. - static DifferentiableTransform *NewTransformOfType(const std::string &type); - - // Write transform to stream - virtual void Write(std::ostream &os, bool binary) const = 0; - - // Reads transform from stream (normally you would previously have created - // the transform object of the correct type using ReadNew(). - virtual void Read(std::istream &is, bool binary) = 0; - - protected: - int32 num_classes_; + This is a version of the transform class that implements fMLLR (with + spherical variances, to make the update equations non-iterative); see + differentiable-fmllr.h where the core parts of this are implemented, + this provides the interface compatible with DifferentiableTransform. + Please see the comments in class DifferentiableTransform (in + differentiable-transform-itf.h) for the meaning and usage of the various + interface functions and their parameters. 
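+
+   Example config line (illustrative; the accepted options are whatever
+   InitFromConfig() and FmllrEstimatorOptions::ReadFromConfig() read):
+
+     FmllrTransform dim=100
+
+   Sketch of test-time usage, assuming a trained FmllrTransform 'transform',
+   features 'feats' and per-frame posteriors 'post' (a sketch, not a
+   verbatim recipe):
+
+     SpeakerStatsItf *stats = transform.GetEmptySpeakerStats();
+     transform.TestingAccumulate(feats, post, stats);
+     stats->Estimate();  // must be called before TestingForward().
+     Matrix<BaseFloat> adapted(feats.NumRows(), feats.NumCols());
+     transform.TestingForward(feats, *stats, &adapted);
+     delete stats;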
+*/ +class FmllrTransform: public DifferentiableTransform { -}; + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; -/** - This is a version of the transform class that does nothing. It's potentially - useful for situations where you want to apply speaker normalization to some - dimensions of the feature vector but not to others. - */ -class NoOpTransform: public DifferentiableTransform { - public: - int32 Dim() const override { return dim_; } MinibatchInfoItf* TrainingForward( @@ -295,342 +60,131 @@ class NoOpTransform: public DifferentiableTransform { int32 num_chunks, int32 num_spk, const Posterior &posteriors, - CuMatrixBase *output) const override { - output->CopyFromMat(input); - return NULL; - } + CuMatrixBase *output) const override; + virtual void TrainingBackward( const CuMatrixBase &input, const CuMatrixBase &output_deriv, int32 num_chunks, int32 num_spk, const Posterior &posteriors, - const MinibatchInfoItf *minibatch_info, - CuMatrixBase *input_deriv) const override { - KALDI_ASSERT(minibatch_info == NULL); - input_deriv->AddMat(1.0, output_deriv); - } - - virtual int32 NumFinalIterations() { return 0; } + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; void Accumulate( int32 final_iter, const CuMatrixBase &input, int32 num_chunks, int32 num_spk, - const Posterior &posteriors) override { } + const Posterior &posteriors) override; + void Estimate(int32 final_iter) override; + + int32 NumFinalIterations() override { return 1; } - SpeakerStatsItf *GetEmptySpeakerStats() override { return NULL; } + SpeakerStatsItf *GetEmptySpeakerStats() const override; void TestingAccumulate( const MatrixBase &input, - const Posterior &posteriors, - SpeakerStatsItf *speaker_stats) const override { } + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; void TestingForward( const MatrixBase &input, const SpeakerStatsItf &speaker_stats, - MatrixBase *output) override { - output->CopyFromMat(input); - } + MatrixBase *output) const override; - void Estimate(int32 final_iter) override { } + FmllrTransform(const FmllrTransform &other); - NoOpTransform(const NoOpTransform &other): - DifferentiableTransform(other), - dim_(other.dim_) { } + FmllrTransform(): target_model_(NULL) { } - DifferentiableTransform* Copy() const override { - return new NoOpTransform(*this); - } + DifferentiableTransform* Copy() const override; void Write(std::ostream &os, bool binary) const override; void Read(std::istream &is, bool binary) override; + ~FmllrTransform(); private: int32 dim_; -}; - - -/** - This is a version of the transform class that does a sequence of other - transforms, specified by other instances of the DifferentiableTransform - interface. - - TODO: finish this. 
- */ -class SequenceTransform: public DifferentiableTransform { - public: - - int32 Dim() const override; - int32 SetNumClasses() const override; - - MinibatchInfoItf* TrainingForward( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - CuMatrixBase *output) const override; - virtual void TrainingBackward( - const CuMatrixBase &input, - const CuMatrixBase &output_deriv, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, - CuMatrixBase *input_deriv) const override; - virtual int32 NumFinalIterations(); + FmllrEstimatorOptions fmllr_opts_; - void Accumulate( - int32 final_iter, - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors) override; - - SpeakerStatsItf *GetEmptySpeakerStats() override; - - void TestingAccumulate( - const MatrixBase &input, - const Posterior &posteriors, - SpeakerStatsItf *speaker_stats) const override; - - virtual void TestingForward( - const MatrixBase &input, - const SpeakerStatsItf &speaker_stats, - MatrixBase *output) override; - - void Estimate(int32 final_iter) override; - - SequenceTransform(const SequenceTransform &other); - - DifferentiableTransform* Copy() const override { - return new SequenceTransform(*this); - } - - void Write(std::ostream &os, bool binary) const override; - - void Read(std::istream &is, bool binary) override; - - private: - std::vector transforms_; + // Note: this target model is only for use in test time. We allocate it the + // first time Accumulate() is called. In training time we estimate it + // minibatch by minibatch (which is why we don't expect to have that many + // classes). At the end of training we'll accumulate stats here in + // Accumulate(), and Estimate() will estimate it. + GaussianEstimator *target_model_; }; - -/** - This is a version of the transform class that consists of a number of other - transforms, appended dimension-wise, so its feature dimension is the sum of - the dimensions of the constituent transforms-- e.g. this could be used to - implement block-diagonal fMLLR, or a structure where some dimensions are - adapted and some are not. 
- */ -class AppendTransform: public DifferentiableTransform { +class FmllrMinibatchInfo: public MinibatchInfoItf { public: - int32 Dim() const override; - int32 SetNumClasses() const override; - - MinibatchInfoItf* TrainingForward( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - CuMatrixBase *output) const override; - virtual void TrainingBackward( - const CuMatrixBase &input, - const CuMatrixBase &output_deriv, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - MinibatchInfoItf *minibatch_info, - CuMatrixBase *input_deriv) const override; - - virtual int32 NumFinalIterations(); - - void Accumulate( - int32 final_iter, - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors) override; - - virtual void TestingForward( - const MatrixBase &input, - const SpeakerStatsItf &speaker_stats, - MatrixBase *output) override; - - void Estimate(int32 final_iter) override; - - AppendTransform(const AppendTransform &other); + FmllrMinibatchInfo(int32 num_classes, int32 dim, int32 num_speakers); - DifferentiableTransform* Copy() const override { - return new AppendTransform(*this); - } - - void Write(std::ostream &os, bool binary) const override; + GaussianEstimator target_model; - void Read(std::istream &is, bool binary) override; + // One estimator of Fmllr per speaker. Make them pointers so we don't have to + // implement self-constructor for class FmllrEstimator. + std::vector estimators; - private: - std::vector transforms_; + ~FmllrMinibatchInfo(); }; +class FmllrSpeakerStats: public SpeakerStatsItf { + public: + // Caution: this object maintains references to mu and s, so it's not a good + // idea to let the target-model (which lives in the FmllrTransform object) be + // deleted during the lifetime of this object. + FmllrSpeakerStats(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s): + estimator(opts, mu, s) { } + void Estimate() override { estimator.Estimate(); } -/** - This is a version of the transform class that implements fMLLR (with - spherical variances, to make the update equations non-iterative); see - differentiable-fmllr.h. -*/ -class FmllrTransform: public DifferentiableTransform { - int32 Dim() const override; - int32 NumClasses() const override; - MinibatchInfoItf* TrainingForward( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - CuMatrixBase *output) const override { - output->CopyFromMat(input); - return NULL; - } - virtual void TrainingBackward( - const CuMatrixBase &input, - const CuMatrixBase &output_deriv, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, - CuMatrixBase *input_deriv) const override; - - void Accumulate( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors) override; - - void Estimate() override { } - - AppendTransform(const AppendTransform &other); - - DifferentiableTransform* Copy() const override; - - void Write(std::ostream &os, bool binary) const override; - - void Read(std::istream &is, bool binary) override; - private: - int32 dim_; - - // TODO: class means and variances for when the model has been trained. 
+ FmllrEstimator estimator; + ~FmllrSpeakerStats() { } }; - /** This version of the transform class does a mean normalization: adding an offset to its input so that the difference (per speaker) of the transformed class means from the speaker-independent class means is minimized. This is like a mean-only fMLLR with fixed (say, unit) covariance model. */ -class SimpleMeanTransform: public DifferentiableTransform { - public: - int32 Dim() const override; - int32 NumClasses() const override; - MinibatchInfoItf* TrainingForward( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - CuMatrixBase *output) const override { - output->CopyFromMat(input); - return NULL; - } - virtual void TrainingBackward( - const CuMatrixBase &input, - const CuMatrixBase &output_deriv, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, - CuMatrixBase *input_deriv) const override; - - void Accumulate( - const CuMatrixBase &input, - int32 num_chunks, - int32 num_spk, - const Posterior &posteriors) override; - - virtual void TestingForward( - const MatrixBase &input, - const SpeakerStatsItf &speaker_stats, - MatrixBase *output) override; - - - void Estimate() override { } - - AppendTransform(const AppendTransform &other); - - DifferentiableTransform* Copy() const override; - - void Write(std::ostream &os, bool binary) const override; - - void Read(std::istream &is, bool binary) override; - private: - - // OK: how to compute stats - class MinibatchInfo: public MinibatchInfoItf { +class MeanOnlyTransform: public DifferentiableTransform { - // Stores the total weights, per frame, that correspond to the Posteriors - // supplied to TrainingForward(). - CuVector frame_weights; - // The total of frame_weights. - BaseFloat total_weight; - }; - - // dim_ is the feature dimension - int32 dim_; - - // The class-dependent means. Dimension is num_classes_ by dim_. - // Note: these will not be set up during training, they will only - // be set up after calling Accumulate() and Estimate(), which happens - // in test time. - CuMatrix means_; - - // mean_stats_ and count_ are used in Accumulate() to accumulate - // statistics to adapt the mean. - CuMatrix mean_stats_; - double count_; - -}; + /* + Example config line: + MeanOnlyTransform dim=100 + */ + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; -/** - Notes on the math behind differentiable fMLLR transform. 
- */ + int32 Dim() const override { return dim_; } -class FmllrTransform: public DifferentiableTransform { - public: - int32 Dim() const override; - int32 NumClasses() const override; MinibatchInfoItf* TrainingForward( const CuMatrixBase &input, int32 num_chunks, int32 num_spk, const Posterior &posteriors, CuMatrixBase *output) const override; + virtual void TrainingBackward( const CuMatrixBase &input, const CuMatrixBase &output_deriv, int32 num_chunks, int32 num_spk, const Posterior &posteriors, - const MinibatchInfoItf &minibatch_info, + MinibatchInfoItf *minibatch_info, CuMatrixBase *input_deriv) const override; + void Accumulate( int32 final_iter, const CuMatrixBase &input, @@ -638,64 +192,78 @@ class FmllrTransform: public DifferentiableTransform { int32 num_spk, const Posterior &posteriors) override; - SpeakerStatsItf *GetEmptySpeakerStats() override; + void Estimate(int32 final_iter) override; + + int32 NumFinalIterations() override { return 1; } + + SpeakerStatsItf *GetEmptySpeakerStats() const override; void TestingAccumulate( const MatrixBase &input, - const Posterior &posteriors, + const SubPosterior &posteriors, SpeakerStatsItf *speaker_stats) const override; - virtual void TestingForward( + void TestingForward( const MatrixBase &input, const SpeakerStatsItf &speaker_stats, - MatrixBase *output) override; + MatrixBase *output) const override; - void Estimate(int32 final_iter) override { } + MeanOnlyTransform(const MeanOnlyTransform &other); - FmllrTransform(const FmllrTransform &other); + MeanOnlyTransform(): target_model_(NULL) { } DifferentiableTransform* Copy() const override; void Write(std::ostream &os, bool binary) const override; void Read(std::istream &is, bool binary) override; + + ~MeanOnlyTransform(); private: + int32 dim_; - // OK: how to compute stats - class MinibatchInfo: public MinibatchInfoItf { + // Note: this target model is only for use in test time. We allocate it the + // first time Accumulate() is called. In training time we estimate it + // minibatch by minibatch (which is why we don't expect to have that many + // classes). At the end of training we'll accumulate stats here in + // Accumulate(), and Estimate() will estimate it. + GaussianEstimator *target_model_; +}; - // Stores the total weights, per frame, that correspond to the Posteriors - // supplied to TrainingForward(). frame_weights.Dim() equals - // input.NumRows(). - CuVector frame_weights; +class MeanOnlyTransformMinibatchInfo: public MinibatchInfoItf { + public: - // The total of frame_weights per speaker. - CuVector frame_weights; + MeanOnlyTransformMinibatchInfo(int32 num_classes, int32 dim, + int32 num_speakers); - BaseFloat total_weight; - }; + GaussianEstimator target_model; - class SpeakerStats: public SpeakerStatsItf { + // One estimator of offset per speaker. Make them pointers so we don't have to + // implement self-constructor for class FmllrEstimator. + std::vector estimators; - }; + ~MeanOnlyTransformMinibatchInfo(); +}; - // dim_ is the feature dimension - int32 dim_; +class MeanOnlyTransformSpeakerStats: public SpeakerStatsItf { + public: + // Caution: this object maintains a reference to mu, so it's not a good idea + // to let the target-model (which lives in the FmllrTransform object) be + // deleted during the lifetime of this object. + MeanOnlyTransformSpeakerStats(const MatrixBase &mu): + estimator(mu) { } - // The class-dependent means. Dimension is num_classes_ by dim_. 
- // Note: these will not be set up during training, they will only - // be set up after calling Accumulate() and Estimate(), which happens - // in test time. - CuMatrix means_; + void Estimate() override { estimator.Estimate(); } - // mean_stats_ and count_ are used in Accumulate() to accumulate - // statistics to adapt the mean. - CuMatrix mean_stats_; - double count_; + MeanOnlyTransformEstimator estimator; + ~MeanOnlyTransformSpeakerStats() { } }; + + + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h new file mode 100644 index 00000000000..3e75db9885b --- /dev/null +++ b/src/adapt/generic-transform.h @@ -0,0 +1,315 @@ +// adapt/generic-transform.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ + +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "adapt/differentiable-transform-itf.h" + +// This header contains 'generic' forms of differentiable transform, which allow +// you to append more basic transforms together or concatenate them dimension-wise. +// Also it includes a no-op transform. + +namespace kaldi { +namespace differentiable_transform { + + +/** + This is a version of the transform class that does nothing. It's potentially + useful for situations where you want to apply speaker normalization to some + dimensions of the feature vector but not to others. 
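+
+   Example config line (illustrative; only the dimension is required):
+
+     NoOpTransform dim=100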
+ */ +class NoOpTransform: public DifferentiableTransform { + public: + + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override { + output->CopyFromMat(input); + return NULL; + } + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override { + KALDI_ASSERT(minibatch_info == NULL); + input_deriv->AddMat(1.0, output_deriv); + } + + virtual int32 NumFinalIterations() { return 0; } + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override { } + + + SpeakerStatsItf *GetEmptySpeakerStats() const override { return NULL; } + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override { } + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override { + output->CopyFromMat(input); + } + + void Estimate(int32 final_iter) override { } + + + NoOpTransform(): dim_(-1) { } + + NoOpTransform(const NoOpTransform &other): + DifferentiableTransform(other), + dim_(other.dim_) { } + + DifferentiableTransform* Copy() const override { + return new NoOpTransform(*this); + } + + std::string Type() const override { return "NoOpTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + private: + int32 dim_; +}; + + +/** + This is a version of the transform class that does a sequence of other + transforms, specified by other instances of the DifferentiableTransform + interface. For instance: fMLLR followed by another fMLLR, or mean normalization + followed by fMLLR. The reason this might make sense is that you'd get a better + estimate of the speaker-adapted class means if you do some kind of speaker + normalization before estimating those class means. + + Caution: the framework currently implicitly assumes that the + final one of the supplied transforms subsumes the previous ones + (as in fMLLR subsumes mean subtraction, or fMLLR subsumes a previous + fMLLR of the same dimension). This means that in test time the + first of the two transforms may be ignored and only the second one + performed. This is in order to keep a single-pass adaptation framework + in test time. The sequence of transforms still makes a difference + because it affects how we compute the adaptation model (i.e., it's + more like a speaker-adapted model than a speaker independent model, + to use traditional ASR terminology). 
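To make this concrete, a configuration describing mean normalization followed by
fMLLR might look roughly like the lines below. This is only a sketch: the transform
type names and the dim= and smoothing-count= options appear in this patch series,
but num-transforms is an assumed name for the option giving the number of component
transforms.

    SequenceTransform num-transforms=2
    MeanOnlyTransform dim=40
    FmllrTransform dim=40 smoothing-count=100

Per the caution above, in test time only the final FmllrTransform would be estimated
and applied.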
+ */ +class SequenceTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override; + void SetNumClasses(int32 num_classes) override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + int32 NumFinalIterations() override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + SpeakerStatsItf *GetEmptySpeakerStats() const override { + // See comment at the top of this class for an explanation. + return transforms_.back()->GetEmptySpeakerStats(); + } + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + SequenceTransform(const SequenceTransform &other); + + SequenceTransform() { } + + DifferentiableTransform* Copy() const override { + return new SequenceTransform(*this); + } + + std::string Type() const override { return "SequenceTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + ~SequenceTransform() override; + private: + std::vector transforms_; +}; + +// This is the type actually returned by TrainingForward() for SequenceTransform. +// It contains a list of other MinibatchInfo, together with the outputs for all +// but the last call. +class SequenceMinibatchInfo: public MinibatchInfoItf { + public: + std::vector info_vec; + // outputs.size() will be info.size() - 1. + std::vector > outputs; + + ~SequenceMinibatchInfo() override; +}; + + +class AppendSpeakerStats: public SpeakerStatsItf { + public: + AppendSpeakerStats() { } + + std::vector stats; + + void Estimate() override; + + ~AppendSpeakerStats(); +}; + +/** + This is a version of the transform class that consists of a number of other + transforms, appended dimension-wise, so its feature dimension is the sum of + the dimensions of the constituent transforms-- e.g. this could be used to + implement block-diagonal fMLLR, or a structure where some dimensions are + adapted and some are not. 
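For instance, adapting the first 40 dimensions with fMLLR while leaving a final
16-dimensional block untouched might be configured roughly as follows; as with the
SequenceTransform example earlier in this header, num-transforms is an assumed
option name, while the transform types and dim= come from this patch series.

    AppendTransform num-transforms=2
    FmllrTransform dim=40
    NoOpTransform dim=16

The resulting AppendTransform would report Dim() == 56, the sum of the dimensions
of its constituent transforms.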
+ */ +class AppendTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override; + void SetNumClasses(int32 num_classes) override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + int32 NumFinalIterations() override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void Estimate(int32 final_iter) override; + + AppendTransform(const AppendTransform &other); + + AppendTransform() { } + + DifferentiableTransform* Copy() const override { + return new AppendTransform(*this); + } + + std::string Type() const override { return "AppendTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + ~AppendTransform(); + private: + std::vector transforms_; +}; + + +// This is the type created by TrainingForward() for AppendTransform. +// It just contains a list of other MinibatchInfo. +class AppendMinibatchInfo: public MinibatchInfoItf { + public: + std::vector info_vec; + + ~AppendMinibatchInfo() override; +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index ca476033950..ece0a79bc48 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -106,7 +106,7 @@ namespace kaldi { it doesn't throw. It's useful if a class can have various forms based on typedefs and virtual classes, and wants to know which version to read. - ReadToken allow the caller to obtain the next token. PeekToken works just + ReadToken allows the caller to obtain the next token. PeekToken works just like ReadToken, but seeks back to the beginning of the token. A subsequent call to ReadToken will read the same token again. This is useful when different object types are written to the same file; using PeekToken one can diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index 0c255845dd5..1c3e9efd38e 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -52,6 +52,36 @@ typedef std::vector > > Posterior; typedef std::vector > > > GaussPost; +/// This class allows you to select a sub-vector of Posteriors, possibly with a +/// stride, without copying them elsewhere. SubPosterior is to Posterior as +/// SubVector is to Vector. (Note: Posterior is actually a typedef to +/// std::vector > >. +/// We can add a non-const interface later if needed. +class SubPosterior { + public: + SubPosterior(const Posterior &post): + num_frames_(post.size()), stride_(1), data_( + num_frames_ == 0 ? 
NULL : &(post[0])) { } + SubPosterior(const Posterior &post, size_t offset, + size_t num_frames, size_t stride = 1): + num_frames_(num_frames), stride_(stride), + data_(num_frames_ == 0 ? NULL : &(post[offset])) { + KALDI_ASSERT(stride > 0 && post.size() > offset + (num_frames-1) * stride); + } + size_t size() const { return num_frames_; } + const std::vector > &operator[] (size_t i) const { + KALDI_PARANOID_ASSERT(i < num_frames_); + return data_[i * stride_]; + } + SubPosterior(const SubPosterior &other) = default; + private: + size_t num_frames_; + size_t stride_; + const std::vector > *data_; +}; + + + // PosteriorHolder is a holder for Posterior, which is // std::vector > > // This is used for storing posteriors of transition id's for an diff --git a/src/nnet3/nnet-parse-test.cc b/src/nnet3/nnet-parse-test.cc index babdbbdcb0e..5ae4917dba6 100644 --- a/src/nnet3/nnet-parse-test.cc +++ b/src/nnet3/nnet-parse-test.cc @@ -23,193 +23,6 @@ namespace kaldi { namespace nnet3 { -void UnitTestConfigLineParse() { - std::string str; - { - ConfigLine cfl; - str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; - bool status = cfl.ParseLine(str); - KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); - - KALDI_ASSERT(cfl.HasUnusedValues()); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("xx", &str_value)); - KALDI_ASSERT(str_value == "yyy"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("foo", &str_value)); - KALDI_ASSERT(str_value == "bar"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "123"); - - std::vector int_values; - KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); - KALDI_ASSERT(cfl.GetValue("baz", &int_values)); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); - KALDI_ASSERT(cfl.GetValue("ba", &int_values)); - KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab =cd ac= bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab=cd ac=bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar"; - KALDI_ASSERT(cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar a=b c d f=g"; - std::string value; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && - cfl.GetValue("a", &value) && value == "b c d" && - cfl.GetValue("f", &value) && value == "g" && - !cfl.HasUnusedValues()); - } - { - ConfigLine cfl; - str = "zzz a=b baz"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && - cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b baz "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b =c"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "x y z"); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); - KALDI_ASSERT(cfl.GetValue("ab", &str_value)); - KALDI_ASSERT(str_value == "cd"); - KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); - KALDI_ASSERT(cfl.HasUnusedValues()); - 
KALDI_ASSERT(cfl.GetValue("ac", &str_value)); - KALDI_ASSERT(str_value == "bd"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "x baz= pp = qq flag=t "; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " x baz= pp=qq flag=t "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == ""); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("flag", &str_value)); - KALDI_ASSERT(str_value == "t"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - - bool bool_value = false; - KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); - KALDI_ASSERT(bool_value); - } - - { - ConfigLine cfl; - str = "xx _baz=a -pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx 0baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx -baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx _baz'=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " baz=g"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); - bool flag; - KALDI_ASSERT(!cfl.GetValue("baz", &flag)); - } - { - ConfigLine cfl; - str = "xx _baz1=a pp=qq"; - KALDI_ASSERT(cfl.ParseLine(str)); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); - } -} - -void UnitTestReadConfig() { - std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" - "a-b beta2='b c' beta3=bd # \n" - "a-b gamma=1:2:3:4 # Int Vector test\n" - " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" - "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" - "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" - "a-b quoted='a b c' # quoted string\n" - "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; - - std::istringstream is(str); - std::vector lines; - ReadConfigLines(is, &lines); - KALDI_ASSERT(lines.size() == 8); - - ConfigLine cfl; - for (size_t i = 0; i < lines.size(); i++) { - KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); - if (i == 1) { - KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); - } - if (i == 4) { - KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); - } - if (i == 5) { - BaseFloat float_val = 0; - KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); - } - if (i == 6) { - KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); - } - if (i == 7) { - KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); - } - } -} void UnitTestDescriptorTokenize() { std::vector lines; @@ -281,8 +94,6 @@ int main() { using namespace kaldi; using namespace kaldi::nnet3; - UnitTestConfigLineParse(); - UnitTestReadConfig(); UnitTestDescriptorTokenize(); UnitTestSummarizeVector(); UnitTestNameMatchesPattern(); diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index a51bba21484..17dec23e7c1 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -27,353 +27,6 @@ namespace kaldi { namespace nnet3 { - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) - return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. 
- while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) - return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals sign, - // or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { - char my_quote = line[next_equals_sign+1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote << " in config line '" - << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": - // in general, config values with spaces in them, even without quoting. - - size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != std::string::npos) { // found a later equals sign. 
- size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. 
-void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2) { - KALDI_ASSERT(token1 != token2); - std::string temp; - ReadToken(is, binary, &temp); - if (temp == token1) { - ExpectToken(is, binary, token2); - } else { - if (temp != token2) { - KALDI_ERR << "Expecting token " << token1 << " or " << token2 - << " but got " << temp; - } - } -} - -// static -bool ParseFromString(const std::string &name, std::string *string, - int32 *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToInteger(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - bool *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - std::string b = split_string[i].substr(len); - if (b.empty()) - KALDI_ERR << "Bad option " << split_string[i]; - if (b[0] == 'f' || b[0] == 'F') *param = false; - else if (b[0] == 't' || b[0] == 'T') *param = true; - else - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToReal(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::string *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - *param = split_string[i].substr(len); - - // Set "string" to all the pieces but the one we used. 
- *string = ""; - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!SplitStringToIntegers(split_string[i].substr(len), ":,", - false, param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - bool DescriptorTokenize(const std::string &input, std::vector *tokens) { KALDI_ASSERT(tokens != NULL); @@ -422,32 +75,6 @@ bool DescriptorTokenize(const std::string &input, return true; } -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') - return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, - std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - std::string ErrorContext(std::istream &is) { if (!is.good()) return "end of line"; char buf[21]; diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index a073a54f7e0..0fc19d51f6c 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -26,103 +26,6 @@ namespace kaldi { namespace nnet3 { -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" - and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free of the '=' - character. If values are going to contain the '=' character, you need to quote them - with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. 
They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; - -}; - -// Note: the ParseFromString functions are to be removed after we switch over to -// using the ConfigLine mechanism. - - -/// \file nnet-parse.h -/// This header contains a few parsing-related functions that are used -/// while reading parsing neural network files and config files. - -/// Function used in Init routines. Suppose name=="foo", if "string" has a -/// field like foo=12, this function will set "param" to 12 and remove that -/// element from "string". It returns true if the parameter was read. -bool ParseFromString(const std::string &name, std::string *string, - int32 *param); - -/// This version of ParseFromString is for parameters of type BaseFloat. -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param); - -/// This version of ParseFromString is for parameters of type bool, which can -/// appear as any string beginning with f, F, t or T. -bool ParseFromString(const std::string &name, std::string *string, - bool *param); - -/// This version of ParseFromString is for parsing strings. (these -/// should not contain space). -bool ParseFromString(const std::string &name, std::string *string, - std::string *param); - -/// This version of ParseFromString handles colon-separated or comma-separated -/// lists of integers. -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param); - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2); /** This function tokenizes input when parsing Descriptor configuration @@ -142,32 +45,6 @@ void ExpectOneOrTwoTokens(std::istream &is, bool binary, bool DescriptorTokenize(const std::string &input, std::vector *tokens); -/// Returns true if 'name' would be a valid name for a component or node in a -/// Nnet. This is a nonempty string beginning with A-Za-z_, and containing only -/// '-', '_', '.', A-Z, a-z, or 0-9. -bool IsValidName(const std::string &name); - - -/** - This function reads in a config file and *appends* its contents to a vector of - lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. 
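(These config-parsing helpers are being moved rather than dropped: they reappear in
util/text-utils.h and util/text-utils.cc further down in this same patch, with the
calling pattern unchanged. A minimal sketch of that pattern, using an invented
example config line:

    std::istringstream is("affine-component dim=512  # trailing comment\n");
    std::vector<std::string> lines;
    ReadConfigLines(is, &lines);             // strips comments and blank lines
    std::vector<ConfigLine> config_lines;
    ParseConfigLines(lines, &config_lines);  // dies (KALDI_ERR) on parse failure
    int32 dim;
    KALDI_ASSERT(config_lines[0].FirstToken() == "affine-component" &&
                 config_lines[0].GetValue("dim", &dim) && dim == 512);
)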
- */ -void ReadConfigLines(std::istream &is, - std::vector *lines); - - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); /* Returns true if name 'name' matches pattern 'pattern'. The pattern diff --git a/src/transform/Makefile b/src/transform/Makefile index 67e5b78fb10..194f362f11a 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -4,14 +4,12 @@ include ../kaldi.mk TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \ regression-tree-test fmllr-diag-gmm-test \ - regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test \ - differentiable-fmllr-test + regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \ lvtln.o mllt.o fmpe.o basis-fmllr-diag-gmm.o \ - compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o \ - differentiable-fmllr.o + compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o LIBNAME = kaldi-transform diff --git a/src/util/text-utils-test.cc b/src/util/text-utils-test.cc index 5bfe4cb24d0..3b58f4f1dd1 100644 --- a/src/util/text-utils-test.cc +++ b/src/util/text-utils-test.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Vimal Manohar (Johns Hopkins University) // See ../../COPYING for clarification regarding multiple authors // @@ -324,6 +325,193 @@ void TestStringsApproxEqual() { KALDI_ASSERT(!StringsApproxEqual("x 1.0 y", "x 1.0001 y", 4)); } +void UnitTestConfigLineParse() { + std::string str; + { + ConfigLine cfl; + str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; + bool status = cfl.ParseLine(str); + KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); + + KALDI_ASSERT(cfl.HasUnusedValues()); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("xx", &str_value)); + KALDI_ASSERT(str_value == "yyy"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("foo", &str_value)); + KALDI_ASSERT(str_value == "bar"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "123"); + + std::vector int_values; + KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); + KALDI_ASSERT(cfl.GetValue("baz", &int_values)); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); + KALDI_ASSERT(cfl.GetValue("ba", &int_values)); + KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab =cd ac= bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab=cd ac=bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar"; + KALDI_ASSERT(cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar a=b c d f=g"; + std::string value; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && + cfl.GetValue("a", &value) && value == "b c d" && + 
cfl.GetValue("f", &value) && value == "g" && + !cfl.HasUnusedValues()); + } + { + ConfigLine cfl; + str = "zzz a=b baz"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && + cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b baz "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b =c"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "x y z"); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); + KALDI_ASSERT(cfl.GetValue("ab", &str_value)); + KALDI_ASSERT(str_value == "cd"); + KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("ac", &str_value)); + KALDI_ASSERT(str_value == "bd"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "x baz= pp = qq flag=t "; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " x baz= pp=qq flag=t "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == ""); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("flag", &str_value)); + KALDI_ASSERT(str_value == "t"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + + bool bool_value = false; + KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); + KALDI_ASSERT(bool_value); + } + + { + ConfigLine cfl; + str = "xx _baz=a -pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx 0baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx -baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx _baz'=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " baz=g"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); + bool flag; + KALDI_ASSERT(!cfl.GetValue("baz", &flag)); + } + { + ConfigLine cfl; + str = "xx _baz1=a pp=qq"; + KALDI_ASSERT(cfl.ParseLine(str)); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); + } +} + +void UnitTestReadConfig() { + std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" + "a-b beta2='b c' beta3=bd # \n" + "a-b gamma=1:2:3:4 # Int Vector test\n" + " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" + "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" + "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" + "a-b quoted='a b c' # quoted string\n" + "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; + + std::istringstream is(str); + std::vector lines; + ReadConfigLines(is, &lines); + KALDI_ASSERT(lines.size() == 8); + + ConfigLine cfl; + for (size_t i = 0; i < lines.size(); i++) { + KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); + if (i == 1) { + KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); + } + if (i == 4) { + KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); + } + if (i == 5) { + BaseFloat float_val = 0; + KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); + } + if (i == 6) { + KALDI_ASSERT(cfl.GetValue("quoted", &str) && 
str == "a b c"); + } + if (i == 7) { + KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); + } + } +} } // end namespace kaldi @@ -344,5 +532,7 @@ int main() { TestNan(); TestInf(); TestInf(); + UnitTestConfigLineParse(); + UnitTestReadConfig(); std::cout << "Test OK\n"; } diff --git a/src/util/text-utils.cc b/src/util/text-utils.cc index 200e3ad9327..bbf38ecc5cc 100644 --- a/src/util/text-utils.cc +++ b/src/util/text-utils.cc @@ -340,4 +340,252 @@ bool StringsApproxEqual(const std::string &a, } +bool ConfigLine::ParseLine(const std::string &line) { + data_.clear(); + whole_line_ = line; + if (line.size() == 0) return false; // Empty line + size_t pos = 0, size = line.size(); + while (isspace(line[pos]) && pos < size) pos++; + if (pos == size) + return false; // whitespace-only line + size_t first_token_start_pos = pos; + // first get first_token_. + while (!isspace(line[pos]) && pos < size) { + if (line[pos] == '=') { + // If the first block of non-whitespace looks like "foo-bar=...", + // then we ignore it: there is no initial token, and FirstToken() + // is empty. + pos = first_token_start_pos; + break; + } + pos++; + } + first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); + // first_token_ is expected to be either empty or something like + // "component-node", which actually is a slightly more restrictive set of + // strings than IsValidName() checks for this is a convenient way to check it. + if (!first_token_.empty() && !IsValidName(first_token_)) + return false; + + while (pos < size) { + if (isspace(line[pos])) { + pos++; + continue; + } + + // OK, at this point we know that we are pointing at nonspace. + size_t next_equals_sign = line.find_first_of("=", pos); + if (next_equals_sign == pos || next_equals_sign == std::string::npos) { + // we're looking for something like 'key=value'. If there is no equals sign, + // or it's not preceded by something, it's a parsing failure. + return false; + } + std::string key(line, pos, next_equals_sign - pos); + if (!IsValidName(key)) return false; + + // handle any quotes. we support key='blah blah' or key="foo bar". + // no escaping is supported. + if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { + char my_quote = line[next_equals_sign+1]; + size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); + if (next_quote == std::string::npos) { // no matching quote was found. + KALDI_WARN << "No matching quote for " << my_quote << " in config line '" + << line << "'"; + return false; + } else { + std::string value(line, next_equals_sign + 2, + next_quote - next_equals_sign - 2); + data_.insert(std::make_pair(key, std::make_pair(value, false))); + pos = next_quote + 1; + continue; + } + } else { + // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": + // in general, config values with spaces in them, even without quoting. + + size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), + terminating_space = size; + + if (next_next_equals_sign != std::string::npos) { // found a later equals sign. 
+ size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); + if (preceding_space != std::string::npos && + preceding_space > next_equals_sign) + terminating_space = preceding_space; + } + while (isspace(line[terminating_space - 1]) && terminating_space > 0) + terminating_space--; + + std::string value(line, next_equals_sign + 1, + terminating_space - (next_equals_sign + 1)); + data_.insert(std::make_pair(key, std::make_pair(value, false))); + pos = terminating_space; + } + } + return true; +} + +bool ConfigLine::GetValue(const std::string &key, std::string *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + *value = (it->second).first; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!ConvertStringToReal((it->second).first, value)) + return false; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, int32 *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!ConvertStringToInteger((it->second).first, value)) + return false; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, std::vector *value) { + KALDI_ASSERT(value != NULL); + value->clear(); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { + // KALDI_WARN << "Bad option " << (it->second).first; + return false; + } + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, bool *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if ((it->second).first.size() == 0) return false; + switch (((it->second).first)[0]) { + case 'F': + case 'f': + *value = false; + break; + case 'T': + case 't': + *value = true; + break; + default: + return false; + } + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::HasUnusedValues() const { + std::map >::const_iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (!(it->second).second) return true; + } + return false; +} + +std::string ConfigLine::UnusedValues() const { + std::string unused_str; + std::map >::const_iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (!(it->second).second) { + if (unused_str == "") + unused_str = it->first + "=" + (it->second).first; + else + unused_str += " " + it->first + "=" + (it->second).first; + } + } + return unused_str; +} + +// This is like ExpectToken but for two tokens, and it +// will either accept token1 and then token2, or just token2. +// This is useful in Read functions where the first token +// may already have been consumed. 
+void ExpectOneOrTwoTokens(std::istream &is, bool binary, + const std::string &token1, + const std::string &token2) { + KALDI_ASSERT(token1 != token2); + std::string temp; + ReadToken(is, binary, &temp); + if (temp == token1) { + ExpectToken(is, binary, token2); + } else { + if (temp != token2) { + KALDI_ERR << "Expecting token " << token1 << " or " << token2 + << " but got " << temp; + } + } +} + + +bool IsValidName(const std::string &name) { + if (name.size() == 0) return false; + for (size_t i = 0; i < name.size(); i++) { + if (i == 0 && !isalpha(name[i]) && name[i] != '_') + return false; + if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') + return false; + } + return true; +} + +void ReadConfigLines(std::istream &is, + std::vector *lines) { + KALDI_ASSERT(lines != NULL); + std::string line; + while (std::getline(is, line)) { + if (line.size() == 0) continue; + size_t start = line.find_first_not_of(" \t"); + size_t end = line.find_first_of('#'); + if (start == std::string::npos || start == end) continue; + end = line.find_last_not_of(" \t", end - 1); + KALDI_ASSERT(end >= start); + lines->push_back(line.substr(start, end - start + 1)); + } +} + +void ParseConfigLines(const std::vector &lines, + std::vector *config_lines) { + config_lines->resize(lines.size()); + for (size_t i = 0; i < lines.size(); i++) { + bool ret = (*config_lines)[i].ParseLine(lines[i]); + if (!ret) { + KALDI_ERR << "Error parsing config line: " << lines[i]; + } + } +} + + } // end namespace kaldi diff --git a/src/util/text-utils.h b/src/util/text-utils.h index 7bc20957672..02f4bf483fc 100644 --- a/src/util/text-utils.h +++ b/src/util/text-utils.h @@ -183,6 +183,98 @@ bool StringsApproxEqual(const std::string &a, const std::string &b, int32 decimal_places_check = 2); +/** + This class is responsible for parsing input like + hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" + and giving you access to the fields, in this case + + FirstToken() == "hi-there", and key->value pairs: + + xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", + bing->"a b c", baz->"a b c d='a b' e" + + The first token is optional, if the line started with a key-value pair then + FirstValue() will be empty. + + Note: it can parse value fields with space inside them only if they are free of the '=' + character. If values are going to contain the '=' character, you need to quote them + with either single or double quotes. + + Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. + */ +class ConfigLine { + public: + // Tries to parse the line as a config-file line. Returns false + // if it could not for some reason, e.g. parsing failure. In most cases + // prints no warnings; the user should do this. Does not expect comments. + bool ParseLine(const std::string &line); + + // the GetValue functions are overloaded for various types. They return true + // if the key exists with value that can be converted to that type, and false + // otherwise. They also mark the key-value pair as having been read. It is + // not an error to read values twice. + bool GetValue(const std::string &key, std::string *value); + bool GetValue(const std::string &key, BaseFloat *value); + bool GetValue(const std::string &key, int32 *value); + // Values may be separated by ":" or by ",". + bool GetValue(const std::string &key, std::vector *value); + bool GetValue(const std::string &key, bool *value); + + bool HasUnusedValues() const; + /// returns e.g. 
foo=bar xxx=yyy if foo and xxx were not consumed by one + /// of the GetValue() functions. + std::string UnusedValues() const; + + const std::string &FirstToken() const { return first_token_; } + + const std::string WholeLine() { return whole_line_; } + // use default assignment operator and copy constructor. + private: + std::string whole_line_; + // the first token of the line, e.g. if line is + // foo-bar baz=bing + // then first_token_ would be "foo-bar". + std::string first_token_; + + // data_ maps from key to (value, is-this-value-consumed?). + std::map > data_; + +}; + +/// This function is like ExpectToken but for two tokens, and it will either +/// accept token1 and then token2, or just token2. This is useful in Read +/// functions where the first token may already have been consumed. +void ExpectOneOrTwoTokens(std::istream &is, bool binary, + const std::string &token1, + const std::string &token2); + + +/** + This function reads in a config file and *appends* its contents to a vector of + lines; it is responsible for removing comments (anything after '#') and + stripping out any lines that contain only whitespace after comment removal. + */ +void ReadConfigLines(std::istream &is, + std::vector *lines); + + +/** + This function converts config-lines from a simple sequence of strings + as output by ReadConfigLines(), into a sequence of first-tokens and + name-value pairs. The general format is: + "command-type bar=baz xx=yyy" + etc., although there are subtleties as to what exactly is allowed, see + documentation for class ConfigLine for details. + This function will die if there was a parsing failure. + */ +void ParseConfigLines(const std::vector &lines, + std::vector *config_lines); + + +/// Returns true if 'name' would be a valid name for a component or node in a +/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing only +/// '-', '_', '.', A-Z, a-z, or 0-9. 
+bool IsValidName(const std::string &name); } // namespace kaldi From 12894e787559310fd7823a5d36e8679184b859ba Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 12 Dec 2018 18:47:03 -0500 Subject: [PATCH 32/87] [src] Add testing code for transforms; fix bug in variance estimation of GaussianEstimator --- src/adapt/Makefile | 2 +- src/adapt/differentiable-fmllr-test.cc | 27 +- src/adapt/differentiable-fmllr.cc | 8 +- src/adapt/differentiable-fmllr.h | 15 +- src/adapt/differentiable-transform-itf.cc | 39 ++ src/adapt/differentiable-transform-itf.h | 25 +- src/adapt/differentiable-transform-test.cc | 685 ++++----------------- src/adapt/differentiable-transform.cc | 49 +- src/adapt/differentiable-transform.h | 9 +- src/adapt/generic-transform.h | 6 +- src/nnet3/nnet-component-itf.cc | 6 +- 11 files changed, 284 insertions(+), 587 deletions(-) diff --git a/src/adapt/Makefile b/src/adapt/Makefile index 8c8f4204802..26aa383f333 100644 --- a/src/adapt/Makefile +++ b/src/adapt/Makefile @@ -2,7 +2,7 @@ all: include ../kaldi.mk -TESTFILES = differentiable-fmllr-test +TESTFILES = differentiable-fmllr-test differentiable-transform-test OBJFILES = differentiable-fmllr.o differentiable-transform-itf.o \ generic-transform.o differentiable-transform.o diff --git a/src/adapt/differentiable-fmllr-test.cc b/src/adapt/differentiable-fmllr-test.cc index 6f001380608..86f3b924418 100644 --- a/src/adapt/differentiable-fmllr-test.cc +++ b/src/adapt/differentiable-fmllr-test.cc @@ -342,7 +342,7 @@ void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, // measure the accuracy of the deriv in 10 random directions int32 n = 10; - BaseFloat epsilon = 0.01; + BaseFloat epsilon = 0.001; Vector expected_changes(n), actual_changes(n); for (int32 i = 0; i < n; i++) { Vector new_s(num_classes, kUndefined); @@ -367,6 +367,30 @@ void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, } +void TestFmllrEstimatorSequence(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + // Do two fMLLR's in a row and see if the change in objf decreases. 
+ + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (first time) is " + << objf_impr; + + + Matrix adapted_feats2(T, dim, kUndefined); + FmllrEstimator f2(opts, mu, s); + BaseFloat objf_impr2 = f.ForwardCombined(adapted_feats, post, &adapted_feats2); + KALDI_LOG << "Forward objf-impr per frame (second time) is " + << objf_impr2; +} void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, const Posterior &post, @@ -580,6 +604,7 @@ void UnitTestGaussianAndEstimators() { } { // test FmllrEstimator + TestFmllrEstimatorSequence(feats, post, g); TestFmllrEstimatorMeanDerivs(feats, post, g); TestFmllrEstimatorFeatDerivs(feats, post, g); TestFmllrEstimatorVarDerivs(feats, post, g); diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc index f19b7c00e51..9d43a465f65 100644 --- a/src/adapt/differentiable-fmllr.cc +++ b/src/adapt/differentiable-fmllr.cc @@ -272,7 +272,7 @@ void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { variance_sharing_weight_ <= 1.0); KALDI_ASSERT(mu_.NumRows() == 0 && "You cannot call Estimate() twice."); - int32 num_classes = m_.NumRows(); + int32 num_classes = m_.NumRows(), dim = m_.NumCols(); mu_ = m_; s_.Resize(num_classes, kUndefined); @@ -287,7 +287,7 @@ void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { // We already copied m_ to mu_. mu_i.Scale(1.0 / gamma_i); s_(i) = std::max(variance_floor_, - v_(i) / gamma_i - VecVec(mu_i, mu_i)); + v_(i) / (gamma_i * dim) - VecVec(mu_i, mu_i) / dim); } } @@ -331,8 +331,8 @@ void GaussianEstimator::AddToOutputDerivs( if (gamma_i != 0.0) { if (s_(i) != variance_floor) { BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; - v_bar_(i) += s_bar_i / gamma_i; - m_bar_i.AddVec(-2.0 * s_bar_i / gamma_i, mu_.Row(i)); + v_bar_(i) += s_bar_i / (gamma_i * dim); + m_bar_i.AddVec(-2.0 * s_bar_i / (gamma_i * dim), mu_.Row(i)); } m_bar_i.AddVec(1.0 / gamma_i, mu_bar.Row(i)); } diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h index a1a7b22d451..d67519e57c2 100644 --- a/src/adapt/differentiable-fmllr.h +++ b/src/adapt/differentiable-fmllr.h @@ -278,6 +278,8 @@ class GaussianEstimator { public: GaussianEstimator(int32 num_classes, int32 feature_dim); + GaussianEstimator(const GaussianEstimator &other) = default; + int32 NumClasses() const { return gamma_.Dim(); } int32 Dim() const; @@ -370,24 +372,23 @@ class GaussianEstimator { The estimation procedure is: \mu_i = \frac{m_i}{\gamma_i}, or 0 if \gamma_i is 0. s_i = variance_floor if \gamma_i = 0, else: - max(variance_floor, v_i/\gamma_i - \mu_i^T \mu_i) - and another form more convenient for backprop: + max(variance_floor, (v_i/\gamma_i - \mu_i^T \mu_i) / dim) + where dim is the feature dimension; and another form more convenient for backprop: = variance_floor if \gamma_i = 0, else: - max(variance_floor, v_i/\gamma_i - m_i^T m_i / \gamma_i^2) - + max(variance_floor, v_i/(dim * \gamma_i) - m_i^T m_i / (dim * \gamma_i^2)) We write \bar{foo} for a derivative of the objective function w.r.t. foo. We are provided by the user with with \bar{\mu}_i and \bar{s}_i, when they call SetOutputDerivs(); and we aim to compute \bar{m}_i and \bar{v}_i, which are the derivs w.r.t. 
the raw statistics. This is done as follows: \bar{m}_i = 0 if \gamma_i is 0, otherwise: - \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i m_i}{\gamma_i^2} + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i m_i}{dim \gamma_i^2} if s_i > variance_floor, else 0) = or 0 if \gamma_i is 0, otherwise: - \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i \mu_i}{\gamma_i} + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i \mu_i}{dim \gamma_i} if s_i > variance_floor, else 0) \bar{v}_i = 0 if \gamma_i is 0 or s_i equals variance_floor, otherwise: - \frac{\bar{s}_i}{\gamma_i} + \frac{\bar{s}_i}{dim * \gamma_i} \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc index 7c467cb8394..684718a06be 100644 --- a/src/adapt/differentiable-transform-itf.cc +++ b/src/adapt/differentiable-transform-itf.cc @@ -19,6 +19,7 @@ #include "adapt/differentiable-transform-itf.h" #include "adapt/generic-transform.h" +#include "adapt/differentiable-transform.h" namespace kaldi { namespace differentiable_transform { @@ -43,8 +44,24 @@ DifferentiableTransform* DifferentiableTransform::ReadNew( // static DifferentiableTransform* DifferentiableTransform::NewTransformOfType( const std::string &type) { + if (type.size() > 2 && type[type.size() - 1] == '>') { + std::string new_type(type); + if (new_type[0] == '<') + new_type.erase(0, 1); // erase "<" + new_type.erase(new_type.size() - 1); // erase ">". + return NewTransformOfType(new_type); + } + if (type == "NoOpTransform") { return new NoOpTransform(); + } else if (type == "FmllrTransform") { + return new FmllrTransform(); + } else if (type == "MeanOnlyTransform") { + return new MeanOnlyTransform(); + } else if (type == "SequenceTransform") { + return new SequenceTransform(); + } else if (type == "AppendTransform") { + return new AppendTransform(); } else { // Calling code will throw an error. return NULL; @@ -96,6 +113,28 @@ void DifferentiableTransform::TestingForwardBatch( output->CopyFromMat(output_cpu); } +// static +DifferentiableTransform* DifferentiableTransform::ReadFromConfig( + std::istream &is, int32 num_classes) { + std::vector lines; + ReadConfigLines(is, &lines); + std::vector config_lines; + ParseConfigLines(lines, &config_lines); + if (config_lines.empty()) + KALDI_ERR << "Config file is empty."; + std::string transform_type = config_lines[0].FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Parsing config file, could not find transform of type " + << transform_type; + int32 pos = transform->InitFromConfig(0, &config_lines); + if (pos != static_cast(config_lines.size())) + KALDI_ERR << "Found junk at end of config file, starting with line " + << pos << ": " << config_lines[pos].WholeLine(); + KALDI_ASSERT(num_classes > 0); + transform->SetNumClasses(num_classes); + return transform; +} } // namespace differentiable_transform } // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index b3595434458..69f56daa17f 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -42,7 +42,7 @@ class SpeakerStatsItf { public: // Does any estimation that is required-- you call this after accumulating // stats and before calling TestingForward(). 
- virtual void Estimate(); + virtual void Estimate() = 0; virtual ~SpeakerStatsItf() { } }; @@ -326,17 +326,23 @@ class DifferentiableTransform { config_lines->size(), it means we're done. */ virtual int32 InitFromConfig(int32 cur_pos, - std::vector *config_lines); + std::vector *config_lines) = 0; // Returns a new transform of the given type e.g. "NoOpTransform" - // or NULL if no such component type exists. + // or NULL if no such component type exists. If angle brackets are + // present, e.g. "", this function will detect and + // remove them. static DifferentiableTransform *NewTransformOfType(const std::string &type); // Reads a differentiable transform from a config file (this function parses // the file and reads a single DifferentiableTransform object from it). Note: // since DifferentiableTransform objects can contain others, the file may - // contain many lines. - static DifferentiableTransform *ReadFromConfig(std::istream &is); + // contain many lines. Throws exception if it did not succeed-- including + // if the config file had junk at the end that was not parsed. + static DifferentiableTransform *ReadFromConfig(std::istream &is, + int32 num_classes); + + // Write transform to stream virtual void Write(std::ostream &os, bool binary) const = 0; @@ -351,15 +357,16 @@ class DifferentiableTransform { // beforehand. static DifferentiableTransform* ReadNew(std::istream &is, bool binary); + DifferentiableTransform(): num_classes_(-1) { } + + DifferentiableTransform(const DifferentiableTransform &other): + num_classes_(other.num_classes_) { } + virtual ~DifferentiableTransform() { } protected: int32 num_classes_; }; -// Attempts to read a transform -DifferentiableTransform *ReadTransformAtPosition( - int32 pos, std::vector *config_lines); - } // namespace differentiable_transform } // namespace kaldi diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc index 419d754d764..2715c1ee4bf 100644 --- a/src/adapt/differentiable-transform-test.cc +++ b/src/adapt/differentiable-transform-test.cc @@ -36,11 +36,12 @@ void WriteRandomConfigOfDim(std::ostream &os, int32 dim) { // We give more cases to the non-recursive transforms to ensure // the expected size of the config file is finite. switch(RandInt(0, 7)) { - case 0: case 1: + case 0: os << "NoOpTransform dim=" << actual_dim << "\n"; return; - case 2: case 3: - os << "FmllrTransform dim=" << actual_dim << "\n"; + case 1: case 2: case 3: + os << "FmllrTransform dim=" << actual_dim << " smoothing-count=" + << 100.0 * RandInt(0, 2) << "\n"; return; case 4: case 5: os << "MeanOnlyTransform dim=" << actual_dim << "\n"; @@ -64,581 +65,164 @@ void WriteRandomConfigOfDim(std::ostream &os, int32 dim) { // This function writes a random configuration file to 'os'. void WriteRandomConfigFile(std::ostream &os) { - WriteRandomConfigOfDim(std::ostream &os, -1); + WriteRandomConfigOfDim(os, -1); } +void UnitTestReadFromConfig() { + using namespace kaldi; + using namespace kaldi::differentiable_transform; - -// Test derivatives produced by the Estimator object for K. -void TestCoreFmllrEstimatorKDeriv( - BaseFloat gamma, - const Matrix &G, - const Matrix &K, - const Matrix &A, - CoreFmllrEstimator *estimator) { - - int32 num_directions = 4; - Vector expected_changes(num_directions), - actual_changes(num_directions); - - int32 dim = G.NumRows(); - BaseFloat epsilon = 1.0e-03 * gamma; - Matrix A_deriv(dim, dim); - // A_deriv defines the objective function: a random linear function in A. 
- A_deriv.SetRandn(); - A_deriv.Add(0.1); // Introduce some asymmetry. - - Matrix G_deriv(dim, dim), - K_deriv(dim, dim); - estimator->Backward(A_deriv, &G_deriv, &K_deriv); - - for (int32 i = 0; i < num_directions; i++) { - Matrix K_new(dim, dim); - K_new.SetRandn(); - K_new.Scale(epsilon); - expected_changes(i) = TraceMatMat(K_new, K_deriv, kTrans); - K_new.AddMat(1.0, K); - FmllrEstimatorOptions opts; - Matrix A_new(dim, dim); - CoreFmllrEstimator estimator2(opts, gamma, G, K_new, &A_new); - estimator2.Forward(); - A_new.AddMat(-1.0, A); - // compute the change in our random linear objective function defined by - // A_deriv, that would be produced by taking some small random change in K - // and computing the A that results from that. - actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); - } - - KALDI_LOG << "Expected changes: " << expected_changes - << ", actual changes: " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_ASSERT(transform != NULL); + delete transform; } } -// Test derivatives produced by the Estimator object for G. -void TestCoreFmllrEstimatorGDeriv( - BaseFloat gamma, - const Matrix &G, - const Matrix &K, - const Matrix &A, - CoreFmllrEstimator *estimator) { - - int32 num_directions = 4; - Vector expected_changes(num_directions), - actual_changes(num_directions); - - int32 dim = G.NumRows(); - BaseFloat epsilon = 1.0e-03 * gamma; - Matrix A_deriv(dim, dim); - // A_deriv defines the objective function: a random linear function in A. - A_deriv.SetRandn(); - A_deriv.Add(0.1); // Introduce some asymmetry. - - Matrix G_deriv(dim, dim), - K_deriv(dim, dim); - estimator->Backward(A_deriv, &G_deriv, &K_deriv); - - KALDI_ASSERT(G_deriv.IsSymmetric()); - - for (int32 i = 0; i < num_directions; i++) { - Matrix G_new(dim, dim); - { - SpMatrix s(dim); - s.SetRandn(); - G_new.CopyFromSp(s); +// Creates a random mean per class and adds it to the features, weighted +// according to the posteriors. It makes the tests more realistic, if +// there are systematic differences between the classes. +void AddRandomMeanOffsets(BaseFloat scale, + int32 num_classes, + const Posterior &post, + CuMatrix *feats) { + int32 T = feats->NumRows(), dim = feats->NumCols(); + CuMatrix class_means(num_classes, dim); + class_means.SetRandn(); + class_means.Scale(scale); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + BaseFloat tot_post = 0.0; + for (; iter != end; ++iter) + tot_post += iter->second; + for (iter = post[t].begin(); iter != end; ++iter) { + int32 i = iter->first; + BaseFloat p = iter->second / tot_post; + feats->Row(t).AddVec(p, class_means.Row(i)); } - G_new.Scale(epsilon); - expected_changes(i) = TraceMatMat(G_new, G_deriv, kTrans); - G_new.AddMat(1.0, G); - FmllrEstimatorOptions opts; - Matrix A_new(dim, dim); - CoreFmllrEstimator estimator2(opts, gamma, G_new, K, &A_new); - estimator2.Forward(); - A_new.AddMat(-1.0, A); - // compute the change in our random linear objective function defined by - // A_deriv, that would be produced by taking some small random change in K - // and computing the A that results from that. 
- actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); - } - - KALDI_LOG << "Expected changes: " << expected_changes - << ", actual changes: " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; - } -} - - - -void UnitTestCoreFmllrEstimatorSimple() { - int32 dim = RandInt(10, 20); - BaseFloat gamma = RandInt(5, 10); - Matrix G(dim, dim), - K(dim, dim), A(dim, dim, kUndefined); - G.AddToDiag(1.234 * gamma); - K.AddToDiag(0.234 * gamma); - FmllrEstimatorOptions opts; - CoreFmllrEstimator estimator(opts, gamma, G, K, &A); - BaseFloat objf_impr = estimator.Forward(); - KALDI_LOG << "A is " << A; - KALDI_ASSERT(A.IsUnit(0.01)); - KALDI_ASSERT(fabs(objf_impr) < 0.01); - for (int32 i = 0; i < 5; i++) { - TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); - TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); - } -} - -static void InitRandNonsingular(MatrixBase *M) { - do { - M->SetRandn(); - } while (M->Cond() > 50.0); -} - - -void UnitTestCoreFmllrEstimatorGeneral() { - int32 dim = RandInt(10, 20); - BaseFloat gamma = RandInt(5, 10); - Matrix G(dim, dim), - K(dim, dim), A(dim, dim, kUndefined); - - { - // make sure G is symmetric and +ve definite. - Matrix A(dim, dim + 10); - A.SetRandn(); - G.AddMatMat(gamma, A, kNoTrans, A, kTrans, 0.0); - } - - InitRandNonsingular(&K); - K.Scale(gamma); - FmllrEstimatorOptions opts; - CoreFmllrEstimator estimator(opts, gamma, G, K, &A); - BaseFloat objf_impr = estimator.Forward(); - KALDI_LOG << "A is " << A << ", objf impr is " << objf_impr; - for (int32 i = 0; i < 5; i++) { - TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); - TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); - } -} - -void TestGaussianEstimatorDerivs(const MatrixBase &feats, - const Posterior &post, - const FmllrEstimatorOptions &opts, - GaussianEstimator *g) { - int32 n = 4; // number of delta-params we use. - Vector expected_changes(n), - actual_changes(n); - - // if !test_mean_deriv, then we test the var deriv. - bool test_mean_deriv = (RandInt(0, 1) == 0); - - int32 num_classes = g->NumClasses(), dim = g->Dim(); - - Matrix mean_derivs(num_classes, dim); - Vector var_derivs(num_classes); - if (test_mean_deriv) { - KALDI_LOG << "Testing mean derivs."; - mean_derivs.SetRandn(); - } else { - KALDI_LOG << "Testing var derivs."; - var_derivs.SetRandn(); - var_derivs.Add(0.2); // Nonzero mean makes the test easier to pass - } - g->AddToOutputDerivs(mean_derivs, var_derivs); - Matrix feats_deriv(feats.NumRows(), feats.NumCols()); - g->AccStatsBackward(feats, post, &feats_deriv); - - BaseFloat epsilon = 1.0e-04; - - for (int32 i = 0; i < n; i++) { - Matrix new_feats(feats.NumRows(), - feats.NumCols()); - new_feats.SetRandn(); - new_feats.Scale(epsilon); - - expected_changes(i) = TraceMatMat(feats_deriv, new_feats, kTrans); - - new_feats.AddMat(1.0, feats); - - GaussianEstimator g2(num_classes, dim); - g2.AccStats(new_feats, post); - g2.Estimate(opts); - - actual_changes(i) = - TraceMatMat(mean_derivs, g2.GetMeans(), kTrans) - - TraceMatMat(mean_derivs, g->GetMeans(), kTrans) + - VecVec(var_derivs, g2.GetVars()) - - VecVec(var_derivs, g->GetVars()); - } - KALDI_LOG << "Actual changes are " << actual_changes - << " vs. predicted " << expected_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. 
" - << actual_changes; } } -void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, - const Posterior &post, - const GaussianEstimator &g) { - const MatrixBase &mu(g.GetMeans()); - const VectorBase &s(g.GetVars()); - - int32 T = feats.NumRows(), dim = feats.NumCols(), - num_classes = mu.NumRows(); - - FmllrEstimatorOptions opts; - - FmllrEstimator f(opts, mu, s); - - Matrix adapted_feats(T, dim, kUndefined); - BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); - KALDI_LOG << "Forward objf-impr per frame (with same features) is " - << objf_impr; - - // adapted_feats_deriv is the deriv of a random objective function - // w.r.t the output (adapted) features. - Matrix adapted_feats_deriv(T, dim), - feats_deriv(T, dim); - adapted_feats_deriv.SetRandn(); - adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. - - f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); - - KALDI_LOG << "2-norm of adapted_feats_deriv is " - << adapted_feats_deriv.FrobeniusNorm() - << ", of feats_deriv is " - << feats_deriv.FrobeniusNorm(); - - const MatrixBase &mu_deriv = f.GetMeanDeriv(); - - // measure the accuracy of the deriv in 4 random directions. - int32 n = 4; - BaseFloat epsilon = 1.0e-04; - Vector expected_changes(n), actual_changes(n); - for (int32 i = 0; i < n; i++) { - Matrix new_mu(num_classes, dim, kUndefined), - new_adapted_feats(T, dim, kUndefined); - new_mu.SetRandn(); - // adding a systematic component helps the test to succeed in low precision. - for (int32 c = 0; c < num_classes; c++) { - new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); +void GetRandomPosterior(int32 num_frames, int32 num_classes, + Posterior *post) { + post->resize(num_frames); + for (int32 t = 0; t < num_frames; t++) { + for (int32 i = 0; i < 3; i++) { + if (RandInt(0, 1) == 0) { + (*post)[t].push_back(std::pair( + RandInt(0, num_classes - 1), 0.1 + RandUniform())); + } } - new_mu.Scale(epsilon); - expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); - new_mu.AddMat(1.0, mu); - FmllrEstimator f2(opts, new_mu, s); - f2.ForwardCombined(feats, post, &new_adapted_feats); - actual_changes(i) = - TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - - TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); } - KALDI_LOG << "Expected changes are " << expected_changes - << " vs. actual " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; - } -} - -void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, - const Posterior &post, - const GaussianEstimator &g) { - const MatrixBase &mu(g.GetMeans()); - const VectorBase &s(g.GetVars()); - - int32 T = feats.NumRows(), dim = feats.NumCols(), - num_classes = mu.NumRows(); - - FmllrEstimatorOptions opts; - - FmllrEstimator f(opts, mu, s); - - Matrix adapted_feats(T, dim, kUndefined); - BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); - KALDI_LOG << "Forward objf-impr per frame (with same features) is " - << objf_impr; - - // adapted_feats_deriv is the deriv of a random objective function - // w.r.t the output (adapted) features. - Matrix adapted_feats_deriv(T, dim), - feats_deriv(T, dim); - adapted_feats_deriv.SetRandn(); - // Adding a systematic component to the derivative makes the test easier - // to pass, as the derivs are less random. 
- adapted_feats_deriv.AddMat(0.1, feats); - - f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); - KALDI_LOG << "2-norm of adapted_feats_deriv is " - << adapted_feats_deriv.FrobeniusNorm() - << ", of feats_deriv is " - << feats_deriv.FrobeniusNorm(); - - const VectorBase &s_deriv = f.GetVarDeriv(); - - // measure the accuracy of the deriv in 10 random directions - int32 n = 10; - BaseFloat epsilon = 0.01; - Vector expected_changes(n), actual_changes(n); - for (int32 i = 0; i < n; i++) { - Vector new_s(num_classes, kUndefined); - Matrix new_adapted_feats(T, dim, kUndefined); - new_s.SetRandn(); - new_s.Scale(epsilon); - expected_changes(i) = VecVec(new_s, s_deriv); - new_s.AddVec(1.0, s); - FmllrEstimator f2(opts, mu, new_s); - f2.ForwardCombined(feats, post, &new_adapted_feats); - actual_changes(i) = - TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - - TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); - } - KALDI_LOG << "Expected changes are " << expected_changes - << " vs. actual " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; - } } - - -void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, - const Posterior &post, - const GaussianEstimator &g) { - int32 T = feats.NumRows(), dim = feats.NumCols(); - const MatrixBase &mu(g.GetMeans()); - const VectorBase &s(g.GetVars()); - - FmllrEstimatorOptions opts; - - FmllrEstimator f(opts, mu, s); - - Matrix adapted_feats(T, dim, kUndefined); - BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); - KALDI_LOG << "Forward objf-impr per frame (with same features) is " - << objf_impr; - - // adapted_feats_deriv is the deriv of a random objective function - // w.r.t the output (adapted) features. - Matrix adapted_feats_deriv(T, dim), - feats_deriv(T, dim); - adapted_feats_deriv.SetRandn(); - adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. - - f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); - - KALDI_LOG << "2-norm of adapted_feats_deriv is " - << adapted_feats_deriv.FrobeniusNorm() - << ", of feats_deriv is " - << feats_deriv.FrobeniusNorm(); - - // measure the accuracy of the deriv in 4 random directions. - int32 n = 4; +void TestTraining(DifferentiableTransform *transform) { + // test that the training process runs. 
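  // Besides checking that the training code runs, the loop further below
  // verifies TrainingBackward() numerically: for a few small random
  // perturbations 'delta' of the input features, the predicted change
  // TraceMatMat(delta, input_deriv) is compared against the observed change
  // TraceMatMat(new_output - output, output_deriv), and the two are required
  // to agree to within a relative tolerance.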
+ int32 dim = transform->Dim(), + num_classes = transform->NumClasses(), + num_frames = RandInt(200, 300), + num_spk = RandInt(2, 10), + chunks_per_spk = RandInt(1, 4), + num_rows = num_frames * num_spk * chunks_per_spk; + CuMatrix input_feats(num_rows, dim), + output_feats(num_rows, dim, kUndefined), + output_deriv(num_rows, dim, kUndefined), + input_deriv(num_rows, dim); + input_feats.SetRandn(); + output_deriv.SetRandn(); + Posterior post; + GetRandomPosterior(num_rows, num_classes, &post); + AddRandomMeanOffsets(10.0, num_classes, post, &input_feats); + + int32 num_chunks = num_spk * chunks_per_spk; + MinibatchInfoItf *info = + transform->TrainingForward(input_feats, num_chunks, num_spk, post, + &output_feats); + CuMatrix diff(input_feats); + diff.AddMat(-1.0, output_feats); + KALDI_LOG << "Difference in features (relative) is " + << (diff.FrobeniusNorm() / input_feats.FrobeniusNorm()); + + + transform->TrainingBackward(input_feats, output_deriv, num_chunks, + num_spk, post, info, &input_deriv); + + + int32 n = 5; + Vector expected_changes(n), observed_changes(n); BaseFloat epsilon = 1.0e-03; - Vector expected_changes(n), actual_changes(n); for (int32 i = 0; i < n; i++) { - Matrix new_feats(T, dim, kUndefined), - new_adapted_feats(T, dim, kUndefined); - new_feats.SetRandn(); - new_feats.Add(RandGauss()); // will help to test whether the indirect - // part of the derivative is accurate. - new_feats.Scale(epsilon); - expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); - new_feats.AddMat(1.0, feats); - FmllrEstimator f2(opts, mu, s); - f2.ForwardCombined(new_feats, post, &new_adapted_feats); - actual_changes(i) = - TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - - TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); - } - KALDI_LOG << "Expected changes are " << expected_changes - << " vs. actual " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; + CuMatrix new_input_feats(num_rows, dim), + new_output_feats(num_rows, dim, kUndefined); + new_input_feats.SetRandn(); + new_input_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_input_feats, input_deriv, kTrans); + new_input_feats.AddMat(1.0, input_feats); + MinibatchInfoItf *info2 = + transform->TrainingForward(new_input_feats, num_chunks, num_spk, + post, &new_output_feats); + delete info2; + new_output_feats.AddMat(-1.0, output_feats); + observed_changes(i) = TraceMatMat(new_output_feats, output_deriv, kTrans); } + KALDI_LOG << "Expected changes: " << expected_changes + << ", observed changes: " << observed_changes; + KALDI_ASSERT(expected_changes.ApproxEqual(observed_changes, 0.15)); } -void TestMeanOnlyTransformEstimatorMeanDerivs( - const MatrixBase &feats, - const Posterior &post, - const GaussianEstimator &g) { - const MatrixBase &mu(g.GetMeans()); - - int32 T = feats.NumRows(), dim = feats.NumCols(), - num_classes = mu.NumRows(); - - MeanOnlyTransformEstimator m(mu); - - Matrix adapted_feats(T, dim, kUndefined); - m.ForwardCombined(feats, post, &adapted_feats); - - // adapted_feats_deriv is the deriv of a random objective function - // w.r.t the output (adapted) features. - Matrix adapted_feats_deriv(T, dim), - feats_deriv(T, dim); - adapted_feats_deriv.SetRandn(); - adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. 
- - m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); - - KALDI_LOG << "2-norm of adapted_feats_deriv is " - << adapted_feats_deriv.FrobeniusNorm() - << ", of feats_deriv is " - << feats_deriv.FrobeniusNorm(); - - const MatrixBase &mu_deriv = m.GetMeanDeriv(); - - // measure the accuracy of the deriv in 4 random directions. - int32 n = 4; - BaseFloat epsilon = 1.0e-03; - Vector expected_changes(n), actual_changes(n); - for (int32 i = 0; i < n; i++) { - Matrix new_mu(num_classes, dim, kUndefined), - new_adapted_feats(T, dim, kUndefined); - new_mu.SetRandn(); - // adding a systematic component helps the test to succeed in low precision. - for (int32 c = 0; c < num_classes; c++) { - new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); +void UnitTestTraining() { + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_LOG << "Config is: " << os.str(); + KALDI_ASSERT(transform != NULL); + if (os.str().find("smoothing-count=0") == std::string::npos) { + // Don't do this test if smoothing-count is zero: it can + // fail but it doesn't indicate a real problem. + TestTraining(transform); } - new_mu.Scale(epsilon); - expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); - new_mu.AddMat(1.0, mu); - MeanOnlyTransformEstimator m2(new_mu); - m2.ForwardCombined(feats, post, &new_adapted_feats); - actual_changes(i) = - TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - - TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); - } - KALDI_LOG << "Expected changes are " << expected_changes - << " vs. actual " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; + delete transform; } } -void TestMeanOnlyTransformEstimatorFeatDerivs( - const MatrixBase &feats, - const Posterior &post, - const GaussianEstimator &g) { - int32 T = feats.NumRows(), dim = feats.NumCols(); - const MatrixBase &mu(g.GetMeans()); +void UnitTestIo() { + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_ASSERT(transform != NULL); + std::ostringstream os2; + bool binary = (RandInt(0,1) == 0); + transform->Write(os2, binary); - MeanOnlyTransformEstimator m(mu); + std::istringstream is2(os2.str()); - Matrix adapted_feats(T, dim, kUndefined); - m.ForwardCombined(feats, post, &adapted_feats); - - // adapted_feats_deriv is the deriv of a random objective function - // w.r.t the output (adapted) features. - Matrix adapted_feats_deriv(T, dim), - feats_deriv(T, dim); - adapted_feats_deriv.SetRandn(); - adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. - - m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); - - KALDI_LOG << "2-norm of adapted_feats_deriv is " - << adapted_feats_deriv.FrobeniusNorm() - << ", of feats_deriv is " - << feats_deriv.FrobeniusNorm(); - - // measure the accuracy of the deriv in 4 random directions. 
- int32 n = 4; - BaseFloat epsilon = 1.0e-03; - Vector expected_changes(n), actual_changes(n); - for (int32 i = 0; i < n; i++) { - Matrix new_feats(T, dim, kUndefined), - new_adapted_feats(T, dim, kUndefined); - new_feats.SetRandn(); - new_feats.Scale(epsilon); - expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); - new_feats.AddMat(1.0, feats); - MeanOnlyTransformEstimator m2(mu); - m2.ForwardCombined(new_feats, post, &new_adapted_feats); - actual_changes(i) = - TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - - TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); - } - KALDI_LOG << "Expected changes are " << expected_changes - << " vs. actual " << actual_changes; - if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { - KALDI_ERR << "Expected and actual changes differ too much: " - << expected_changes << " vs. " - << actual_changes; - } -} - - -void UnitTestGaussianAndEstimators() { - // It's important that the number of classes be greater than the dimension, or - // we would get a low-rank K. - int32 num_classes = RandInt(30, 40), - dim = RandInt(10, 20), - num_frames = RandInt(20 * num_classes, 40 * num_classes); - - GaussianEstimator g(num_classes, dim); - - Matrix feats(num_frames, dim); - feats.SetRandn(); - feats.Add(0.2); // Nonzero offset tests certain aspects of the code better. - Posterior post(num_frames); - for (int32 t = 0; t < num_frames; t++) { - int32 n = RandInt(0, 2); - for (int32 j = 0; j < n; j++) { - int32 i = RandInt(0, num_classes - 1); - BaseFloat p = 0.25 * RandInt(1, 5); - post[t].push_back(std::pair(i, p)); - } - } - g.AccStats(feats, post); - FmllrEstimatorOptions opts; - // avoid setting variance_sharing_weight to 1.0; it's hard for the tests to - // succeed then, and there are valid reasons for that - opts.variance_sharing_weight = 0.25 * RandInt(0, 2); - g.Estimate(opts); - KALDI_LOG << "Means are: " - << g.GetMeans() << ", vars are: " - << g.GetVars(); - - TestGaussianEstimatorDerivs(feats, post, opts, &g); - - if (RandInt(0, 1) == 0) { - opts.smoothing_count = 500.0; - } - - { // test FmllrEstimator - TestFmllrEstimatorMeanDerivs(feats, post, g); - TestFmllrEstimatorFeatDerivs(feats, post, g); - TestFmllrEstimatorVarDerivs(feats, post, g); - } - - { // test MeanOnlyTransformEstimator. 
- TestMeanOnlyTransformEstimatorMeanDerivs(feats, post, g); - TestMeanOnlyTransformEstimatorFeatDerivs(feats, post, g); + DifferentiableTransform *transform2 = + DifferentiableTransform::ReadNew(is2, binary); + std::ostringstream os3; + transform2->Write(os3, binary); + KALDI_ASSERT(os2.str() == os3.str()); + delete transform; + delete transform2; } - - - - } @@ -651,10 +235,9 @@ void UnitTestGaussianAndEstimators() { int main() { using namespace kaldi::differentiable_transform; - for (int32 i = 0; i < 50; i++) { - UnitTestCoreFmllrEstimatorSimple(); - UnitTestCoreFmllrEstimatorGeneral(); - UnitTestGaussianAndEstimators(); - } + UnitTestReadFromConfig(); + UnitTestIo(); + UnitTestTraining(); + std::cout << "Test OK.\n"; } diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc index e622effc9ea..72e34e02764 100644 --- a/src/adapt/differentiable-transform.cc +++ b/src/adapt/differentiable-transform.cc @@ -42,7 +42,7 @@ int32 FmllrTransform::InitFromConfig( std::vector *config_lines) { KALDI_ASSERT(cur_pos < int32(config_lines->size())); ConfigLine *line = &((*config_lines)[cur_pos]); - KALDI_ASSERT(line->FirstToken() == "FmllrTransform"); + KALDI_ASSERT(line->FirstToken() == Type()); if (!line->GetValue("dim", &dim_) || dim_ <= 0) KALDI_ERR << "Dimension 'dim' must be specified for FmllrTransform, config " @@ -55,6 +55,17 @@ int32 FmllrTransform::InitFromConfig( return cur_pos + 1; } + +FmllrTransform::FmllrTransform(const FmllrTransform &other): + DifferentiableTransform(other), + dim_(other.dim_), fmllr_opts_(other.fmllr_opts_), + target_model_(other.target_model_ == NULL ? NULL : + new GaussianEstimator(*other.target_model_)) { } + +DifferentiableTransform *FmllrTransform::Copy() const { + return new FmllrTransform(*this); +} + void FmllrTransform::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); @@ -136,8 +147,10 @@ MinibatchInfoItf* FmllrTransform::TrainingForward( ans->estimators[speaker]->AccStats(this_input, this_posteriors); } BaseFloat objf_impr = 0.0; - for (int32 s = 0; s < num_spk; s++) - objf_impr += ans->estimators[s]->Estimate() / num_spk; + for (int32 s = 0; s < num_spk; s++) { + BaseFloat this_impr = ans->estimators[s]->Estimate(); + objf_impr += this_impr / num_spk; + } // objf_impr is now the average objective-function improvement per frame. // We will later find a better way to display this. 
KALDI_LOG << "Objective function improvement per frame is " @@ -240,6 +253,13 @@ void FmllrTransform::Accumulate( target_model_->AccStats(input_cpu, posteriors); } + +void FmllrTransform::Estimate(int32 final_iter) { + KALDI_ASSERT(final_iter == 0 && target_model_ != NULL); + target_model_->Estimate(fmllr_opts_); +} + + SpeakerStatsItf *FmllrTransform::GetEmptySpeakerStats() const { KALDI_ASSERT(target_model_ != NULL && target_model_->GetMeans().NumRows() != 0 && @@ -277,7 +297,6 @@ FmllrTransform::~FmllrTransform() { } - MeanOnlyTransformMinibatchInfo::MeanOnlyTransformMinibatchInfo( int32 num_classes, int32 dim, int32 num_speakers): target_model(num_classes, dim), @@ -294,7 +313,7 @@ int32 MeanOnlyTransform::InitFromConfig( std::vector *config_lines) { KALDI_ASSERT(cur_pos < int32(config_lines->size())); ConfigLine *line = &((*config_lines)[cur_pos]); - KALDI_ASSERT(line->FirstToken() == "MeanOnlyTransform"); + KALDI_ASSERT(line->FirstToken() == Type()); if (!line->GetValue("dim", &dim_) || dim_ <= 0) KALDI_ERR << "Dimension 'dim' must be specified for MeanOnlyTransform, config " @@ -306,6 +325,11 @@ int32 MeanOnlyTransform::InitFromConfig( return cur_pos + 1; } +MeanOnlyTransform::MeanOnlyTransform(const MeanOnlyTransform &other): + DifferentiableTransform(other), + dim_(other.dim_), target_model_(other.target_model_ == NULL ? NULL : + new GaussianEstimator(*other.target_model_)) { } + void MeanOnlyTransform::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); @@ -402,6 +426,11 @@ MinibatchInfoItf* MeanOnlyTransform::TrainingForward( return ans; } + +DifferentiableTransform *MeanOnlyTransform::Copy() const { + return new MeanOnlyTransform(*this); +} + void MeanOnlyTransform::TrainingBackward( const CuMatrixBase &input, const CuMatrixBase &output_deriv, @@ -489,6 +518,16 @@ void MeanOnlyTransform::Accumulate( target_model_->AccStats(input_cpu, posteriors); } +void MeanOnlyTransform::Estimate(int32 final_iter) { + KALDI_ASSERT(final_iter == 0 && target_model_ != NULL); + // The options only affect the estimates of the variance, which we don't use + // here, so we use the default options. + FmllrEstimatorOptions default_opts; + target_model_->Estimate(default_opts); +} + + + SpeakerStatsItf *MeanOnlyTransform::GetEmptySpeakerStats() const { KALDI_ASSERT(target_model_ != NULL && target_model_->GetMeans().NumRows() != 0 && diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h index dce197e9444..c0dfe027969 100644 --- a/src/adapt/differentiable-transform.h +++ b/src/adapt/differentiable-transform.h @@ -48,7 +48,7 @@ namespace differentiable_transform { interface functions and their parameters. */ class FmllrTransform: public DifferentiableTransform { - + public: int32 InitFromConfig(int32 cur_pos, std::vector *config_lines) override; @@ -98,6 +98,8 @@ class FmllrTransform: public DifferentiableTransform { FmllrTransform(): target_model_(NULL) { } + std::string Type() const override { return "FmllrTransform"; } + DifferentiableTransform* Copy() const override; void Write(std::ostream &os, bool binary) const override; @@ -156,8 +158,7 @@ class FmllrSpeakerStats: public SpeakerStatsItf { This is like a mean-only fMLLR with fixed (say, unit) covariance model. 
*/ class MeanOnlyTransform: public DifferentiableTransform { - - + public: /* Example config line: @@ -212,6 +213,8 @@ class MeanOnlyTransform: public DifferentiableTransform { MeanOnlyTransform(): target_model_(NULL) { } + std::string Type() const override { return "MeanOnlyTransform"; } + DifferentiableTransform* Copy() const override; void Write(std::ostream &os, bool binary) const override; diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h index 3e75db9885b..9d8079b339a 100644 --- a/src/adapt/generic-transform.h +++ b/src/adapt/generic-transform.h @@ -18,8 +18,8 @@ // limitations under the License. -#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ -#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#ifndef KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ +#define KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ #include #include "base/kaldi-common.h" @@ -312,4 +312,4 @@ class AppendMinibatchInfo: public MinibatchInfoItf { } // namespace differentiable_transform } // namespace kaldi -#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#endif // KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 1ff7daa01d1..53859e9b03c 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -81,9 +81,9 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute // static Component* Component::ReadNew(std::istream &is, bool binary) { std::string token; - ReadToken(is, binary, &token); // e.g. "". - token.erase(0, 1); // erase "<". - token.erase(token.length()-1); // erase ">". + ReadToken(is, binary, &token); // e.g. "". + token.erase(0, 1); // erase "<". + token.erase(token.length() - 1); // erase ">". Component *ans = NewComponentOfType(token); if (!ans) KALDI_ERR << "Unknown component type " << token; From 3b1351ffe282ec7d07ea00a4bd9bbacbf21ec261 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 12 Dec 2018 19:47:23 -0500 Subject: [PATCH 33/87] [src] Add more testing code; more bug fixes. --- src/adapt/differentiable-transform-itf.cc | 2 +- src/adapt/differentiable-transform-itf.h | 7 ++-- src/adapt/differentiable-transform-test.cc | 37 +++++++++++++++++++--- src/adapt/generic-transform.h | 4 ++- 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc index 684718a06be..aafb9abe86f 100644 --- a/src/adapt/differentiable-transform-itf.cc +++ b/src/adapt/differentiable-transform-itf.cc @@ -74,7 +74,7 @@ void DifferentiableTransform::TestingForwardBatch( int32 num_chunks, int32 num_spk, const Posterior &posteriors, - CuMatrixBase *output) { + CuMatrixBase *output) const { int32 dim = input.NumCols(), num_frames = input.NumRows(), chunks_per_spk = num_chunks / num_spk, diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index 69f56daa17f..e1b7e25c210 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -41,8 +41,9 @@ class MinibatchInfoItf { class SpeakerStatsItf { public: // Does any estimation that is required-- you call this after accumulating - // stats and before calling TestingForward(). - virtual void Estimate() = 0; + // stats and before calling TestingForward(). You'll normally want to + // override this, unless your object requires no estimation. 
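  // (For instance, NoOpTransform's GetEmptySpeakerStats() returns a plain
  // SpeakerStatsItf and relies on this default no-op Estimate().)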
+ virtual void Estimate() { } virtual ~SpeakerStatsItf() { } }; @@ -300,7 +301,7 @@ class DifferentiableTransform { int32 num_chunks, int32 num_spk, const Posterior &posteriors, - CuMatrixBase *output); + CuMatrixBase *output) const; // Copies transform (deep copy). virtual DifferentiableTransform* Copy() const = 0; diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc index 2715c1ee4bf..34efac9770c 100644 --- a/src/adapt/differentiable-transform-test.cc +++ b/src/adapt/differentiable-transform-test.cc @@ -155,7 +155,6 @@ void TestTraining(DifferentiableTransform *transform) { transform->TrainingBackward(input_feats, output_deriv, num_chunks, num_spk, post, info, &input_deriv); - int32 n = 5; Vector expected_changes(n), observed_changes(n); BaseFloat epsilon = 1.0e-03; @@ -176,6 +175,33 @@ void TestTraining(DifferentiableTransform *transform) { KALDI_LOG << "Expected changes: " << expected_changes << ", observed changes: " << observed_changes; KALDI_ASSERT(expected_changes.ApproxEqual(observed_changes, 0.15)); + + { + // Test that if we do Accumulate() and Estimate() on the same data we + // trained on, and then TestingForwardBatch(), we get the same answer + // as during training. Note: this may not be true for all examples + // including SequenceTransform, due to how we treat the last of the + // transforms specially. + + int32 num_final_iters = transform->NumFinalIterations(); + for (int32 i = 0; i < num_final_iters; i++) { + transform->Accumulate(i, input_feats, num_chunks, num_spk, post); + transform->Estimate(i); + } + CuMatrix output_feats2(output_feats.NumRows(), + output_feats.NumCols(), kUndefined); + transform->TestingForwardBatch(input_feats, num_chunks, num_spk, post, + &output_feats2); + output_feats2.AddMat(-1.0, output_feats); + BaseFloat rel_diff = (output_feats2.FrobeniusNorm() / + output_feats.FrobeniusNorm()); + KALDI_LOG << "Difference in features train vs. test (relative) is " + << rel_diff; + if (rel_diff > 0.001) { + KALDI_WARN << "Make sure this config would not be equivalent train " + "vs. test (see config printed above)."; + } + } } @@ -235,9 +261,10 @@ void UnitTestIo() { int main() { using namespace kaldi::differentiable_transform; - UnitTestReadFromConfig(); - UnitTestIo(); - UnitTestTraining(); - + for (int32 i = 0; i < 3; i++) { + UnitTestReadFromConfig(); + UnitTestIo(); + UnitTestTraining(); + } std::cout << "Test OK.\n"; } diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h index 9d8079b339a..4d76e936e2a 100644 --- a/src/adapt/generic-transform.h +++ b/src/adapt/generic-transform.h @@ -79,7 +79,9 @@ class NoOpTransform: public DifferentiableTransform { const Posterior &posteriors) override { } - SpeakerStatsItf *GetEmptySpeakerStats() const override { return NULL; } + SpeakerStatsItf *GetEmptySpeakerStats() const override { + return new SpeakerStatsItf(); + } void TestingAccumulate( const MatrixBase &input, From f4a4f6f56fd0ef2b5cc421461efea14d0d46d3fc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 13 Dec 2018 17:55:16 -0500 Subject: [PATCH 34/87] [src]Add new constructor for SubPosterior --- src/hmm/posterior.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index 1c3e9efd38e..662a93d2478 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -68,6 +68,12 @@ class SubPosterior { data_(num_frames_ == 0 ? 
NULL : &(post[offset])) { KALDI_ASSERT(stride > 0 && post.size() > offset + (num_frames-1) * stride); } + SubPosterior(const SubPosterior &post, size_t offset, + size_t num_frames, size_t stride = 1): + num_frames_(num_frames), stride_(stride * post.stride_), + data_(num_frames_ == 0 ? NULL : post.data_ + (offset * post.stride_)) { + KALDI_ASSERT(offset + num_frames * (stride - 1) < post.num_frames_); + } size_t size() const { return num_frames_; } const std::vector > &operator[] (size_t i) const { KALDI_PARANOID_ASSERT(i < num_frames_); From 1210afb047c834a060e7fc2a26bcbb9dea59a922 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 14 Dec 2018 21:48:14 -0500 Subject: [PATCH 35/87] [src] Add missing file; make overrides consistent. --- src/adapt/generic-transform.cc | 564 +++++++++++++++++++++++++++++++++ src/adapt/generic-transform.h | 10 +- 2 files changed, 569 insertions(+), 5 deletions(-) create mode 100644 src/adapt/generic-transform.cc diff --git a/src/adapt/generic-transform.cc b/src/adapt/generic-transform.cc new file mode 100644 index 00000000000..12cbe938d03 --- /dev/null +++ b/src/adapt/generic-transform.cc @@ -0,0 +1,564 @@ +// adapt/generic-transform.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
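// This file implements the "generic" transform types declared in
// generic-transform.h: NoOpTransform, SequenceTransform and AppendTransform.
// (The acoustic ones, FmllrTransform and MeanOnlyTransform, are implemented
// in differentiable-transform.cc.)  SequenceTransform and AppendTransform are
// containers: their InitFromConfig() consumes a line such as
//   AppendTransform num-transforms=2
// followed by the config lines of each member transform, for example
// (dimensions here are purely illustrative):
//   NoOpTransform dim=40
//   FmllrTransform dim=16 smoothing-count=100.0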
+ +#include "adapt/differentiable-transform-itf.h" +#include "adapt/generic-transform.h" + +namespace kaldi { +namespace differentiable_transform { + + +int32 NoOpTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for NoOpTransform, config " + "line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + + +void NoOpTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); +} + +void NoOpTransform::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); +} + + +int32 SequenceTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size()) && + transforms_.empty()); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + int32 num_transforms = -1; + if (!line->GetValue("num-transforms", &num_transforms) || + num_transforms <= 0) + KALDI_ERR << "Config value num-transforms must be specified for " + "SequenceTransform, line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + cur_pos++; + + int32 dim = 0; + for (int32 i = 0; i < num_transforms; i++) { + if (cur_pos >= int32(config_lines->size())) + KALDI_ERR << "Config file lacks enough lines for SequenceTransform."; + ConfigLine *other_line = &((*config_lines)[cur_pos]); + std::string transform_type = other_line->FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Could not find transform of type " << transform_type; + cur_pos = transform->InitFromConfig(cur_pos, config_lines); + if (i == 0) { + dim = transform->Dim(); + } else if (dim != transform->Dim()) { + KALDI_ERR << "Transforms used in SequenceTransform have inconsistent dim: " + << dim << " vs " << transform->Dim(); + } + transforms_.push_back(transform); + } + return cur_pos; +} + + +SequenceTransform::SequenceTransform(const SequenceTransform &other): + DifferentiableTransform(other), + transforms_(other.transforms_.size(), NULL) { + for (size_t i = 0; i < other.transforms_.size(); i++) + transforms_[i] = other.transforms_[i]->Copy(); +} + + +void SequenceTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + int32 num_transforms = transforms_.size(); + WriteBasicType(os, binary, num_transforms); + for (int32 i = 0; i < num_transforms; i++) + transforms_[i]->Write(os, binary); + WriteToken(os, binary, ""); +} + +void SequenceTransform::Read(std::istream &is, bool binary) { + while (!transforms_.empty()) { + delete transforms_.back(); + 
transforms_.pop_back(); + } + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + int32 num_transforms; + ReadBasicType(is, binary, &num_transforms); + for (int32 i = 0; i < num_transforms; i++) { + std::string tok; + ReadToken(is, binary, &tok); + DifferentiableTransform *transform; + if (!(transform = NewTransformOfType(tok))) + KALDI_ERR << "Expected the name of a transform, got " + << tok << " (maybe you should recompile?)"; + transform->Read(is, binary); + transforms_.push_back(transform); + } + ExpectToken(is, binary, ""); +} + +int32 SequenceTransform::Dim() const { + size_t num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + return transforms_[0]->Dim(); +} + +void SequenceTransform::SetNumClasses(int32 num_classes) { + KALDI_ASSERT(num_classes > 0); + num_classes_ = num_classes; + for (size_t i = 0; i < transforms_.size(); i++) { + transforms_[i]->SetNumClasses(num_classes); + } +} + +SequenceTransform::~SequenceTransform() { + for (size_t i = 0; i < transforms_.size(); i++) + delete transforms_[i]; +} + +MinibatchInfoItf* SequenceTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + KALDI_ASSERT(SameDim(input, *output) && + !transforms_.empty()); + SequenceMinibatchInfo *ans = new SequenceMinibatchInfo(); + + const CuMatrixBase *last_output = &input; + CuMatrixBase *this_output; + + ans->outputs.resize(transforms_.size() - 1); + + for (size_t i = 0; i < transforms_.size(); i++) { + if (i + 1 == transforms_.size()) { + this_output = output; + } else { + // not the final transform. + ans->outputs[i].Resize(output->NumRows(), output->NumCols(), kUndefined); + this_output = &(ans->outputs[i]); + } + ans->info_vec.push_back(transforms_[i]->TrainingForward( + *last_output, num_chunks, num_spk, posteriors, this_output)); + last_output = this_output; + } + return ans; +} + +void SequenceTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + KALDI_ASSERT(SameDim(input, output_deriv) && SameDim(input, *input_deriv)); + + SequenceMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Mismatched MinibatchInfo type?"); + + CuMatrix temp_deriv(input.NumRows(), + input.NumCols()); + int32 num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + + const CuMatrixBase *cur_output_deriv = &output_deriv; + + for (int32 i = num_transforms - 1; i >= 0; i--) { + const CuMatrixBase *cur_input = (i == 0 ? &input : + &(info->outputs[i-1])); + CuMatrixBase *cur_input_deriv; + if (i == 0) { + cur_input_deriv = input_deriv; + } else if (i == num_transforms - 1) { + cur_input_deriv = &temp_deriv; + } else { + // this matrix is no longer needed, store the intermediate deriv here. + cur_input_deriv = &(info->outputs[i]); + cur_input_deriv->SetZero(); + } + transforms_[i]->TrainingBackward(*cur_input, *cur_output_deriv, + num_chunks, num_spk, posteriors, + info->info_vec[i], cur_input_deriv); + info->info_vec[i] = NULL; // Prevent it from being deleted twice. + cur_output_deriv = cur_input_deriv; + } + delete info; // This function took ownership. 
+} + +int32 SequenceTransform::NumFinalIterations() { + int32 ans = 0; + for (size_t i = 0; i < transforms_.size(); i++) + ans += transforms_[i]->NumFinalIterations(); + return ans; +} + +void SequenceTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + CuMatrix temp; + const CuMatrixBase *cur_input = &input; + + int32 prev_final_iters = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 nf = transforms_[i]->NumFinalIterations(); + if (final_iter < prev_final_iters + nf) { + transforms_[i]->Accumulate(final_iter - prev_final_iters, + *cur_input, num_chunks, num_spk, + posteriors); + return; + } else { + KALDI_ASSERT(i + 1 < transforms_.size()); + // We have to propagate the features through this transform. + CuMatrix this_output(input.NumRows(), input.NumCols(), + kUndefined); + transforms_[i]->TestingForwardBatch(*cur_input, num_chunks, num_spk, + posteriors, &this_output); + temp.Swap(&this_output); + cur_input = &temp; + } + prev_final_iters += nf; + } + KALDI_ERR << "final_iter out of range."; +} + +void SequenceTransform::Estimate(int32 final_iter) { + CuMatrix temp; + + int32 prev_final_iters = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 nf = transforms_[i]->NumFinalIterations(); + if (final_iter < prev_final_iters + nf) { + transforms_[i]->Estimate(final_iter - prev_final_iters); + return; + } + prev_final_iters += nf; + } + KALDI_ERR << "final_iter out of range."; +} + +void SequenceTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + transforms_.back()->TestingAccumulate(input, posteriors, + speaker_stats); +} + +void SequenceTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + transforms_.back()->TestingForward(input, speaker_stats, output); +} + + +SequenceMinibatchInfo::~SequenceMinibatchInfo() { + for (size_t i = 0; i < info_vec.size(); i++) + delete info_vec[i]; +} + + + +int32 AppendTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size()) && + transforms_.empty()); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + int32 num_transforms = -1; + if (!line->GetValue("num-transforms", &num_transforms) || + num_transforms <= 0) + KALDI_ERR << "Config value num-transforms must be specified for " + "AppendTransform, line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + cur_pos++; + + for (int32 i = 0; i < num_transforms; i++) { + if (cur_pos >= int32(config_lines->size())) + KALDI_ERR << "Config file lacks enough lines for AppendTransform."; + ConfigLine *other_line = &((*config_lines)[cur_pos]); + std::string transform_type = other_line->FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Could not find transform of type " << transform_type; + cur_pos = transform->InitFromConfig(cur_pos, config_lines); + transforms_.push_back(transform); + } + return cur_pos; +} + + + +AppendTransform::AppendTransform(const AppendTransform &other): + DifferentiableTransform(other), + transforms_(other.transforms_.size(), NULL) { + for (size_t i = 0; i < other.transforms_.size(); i++) + 
transforms_[i] = other.transforms_[i]->Copy(); +} + + + +void AppendTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + int32 num_transforms = transforms_.size(); + WriteBasicType(os, binary, num_transforms); + for (int32 i = 0; i < num_transforms; i++) + transforms_[i]->Write(os, binary); + WriteToken(os, binary, ""); +} + +void AppendTransform::Read(std::istream &is, bool binary) { + while (!transforms_.empty()) { + delete transforms_.back(); + transforms_.pop_back(); + } + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + int32 num_transforms; + ReadBasicType(is, binary, &num_transforms); + for (int32 i = 0; i < num_transforms; i++) { + std::string tok; + ReadToken(is, binary, &tok); + DifferentiableTransform *transform; + if (!(transform = NewTransformOfType(tok))) + KALDI_ERR << "Expected the name of a transform, got " + << tok << " (maybe you should recompile?)"; + transform->Read(is, binary); + transforms_.push_back(transform); + } + ExpectToken(is, binary, ""); +} + +int32 AppendTransform::Dim() const { + size_t num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + int32 ans = 0; + for (size_t i = 0; i < num_transforms; i++) + ans += transforms_[i]->Dim(); + return ans; +} + +void AppendTransform::SetNumClasses(int32 num_classes) { + num_classes_ = num_classes; + for (size_t i = 0; i < transforms_.size(); i++) { + transforms_[i]->SetNumClasses(num_classes); + } +} + +AppendTransform::~AppendTransform() { + for (size_t i = 0; i < transforms_.size(); i++) + delete transforms_[i]; +} + + +MinibatchInfoItf* AppendTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + KALDI_ASSERT(input.NumCols() == Dim() && + SameDim(input, *output)); + AppendMinibatchInfo *ans = new AppendMinibatchInfo(); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + CuSubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_part = output->ColRange(dim_offset, this_dim); + ans->info_vec.push_back(transforms_[i]->TrainingForward( + input_part, num_chunks, num_spk, posteriors, &output_part)); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); + return ans; +} + +void AppendTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + AppendMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Mismatched MinibatchInfo type?"); + + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + CuSubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_deriv_part = output_deriv.ColRange(dim_offset, this_dim), + input_deriv_part = input_deriv->ColRange(dim_offset, this_dim); + transforms_[i]->TrainingBackward( + input_part, output_deriv_part, num_chunks, num_spk, + posteriors, info->info_vec[i], &input_deriv_part); + info->info_vec[i] = NULL; // Prevent it from being deleted twice. + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); + delete info; // This function took ownership. 
+} + +int32 AppendTransform::NumFinalIterations() { + int32 ans = 0; + for (size_t i = 0; i < transforms_.size(); i++) + ans = std::max(ans, transforms_[i]->NumFinalIterations()); + return ans; +} + + +void AppendTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + int32 num_final_iters = 0, + dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_nf = transforms_[i]->NumFinalIterations(), + this_dim = transforms_[i]->Dim(); + if (final_iter < this_nf) + transforms_[i]->Accumulate(final_iter, + input.ColRange(dim_offset, this_dim), + num_chunks, num_spk, posteriors); + if (this_nf > num_final_iters) + num_final_iters = this_nf; + dim_offset += this_dim; + } + KALDI_ASSERT(final_iter >= 0 && final_iter < num_final_iters); +} + +void AppendTransform::Estimate(int32 final_iter) { + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_nf = transforms_[i]->NumFinalIterations(); + if (final_iter < this_nf) { + transforms_[i]->Estimate(final_iter); + } + } +} + +AppendMinibatchInfo::~AppendMinibatchInfo() { + for (size_t i = 0; i < info_vec.size(); i++) + delete info_vec[i]; +} + +SpeakerStatsItf* AppendTransform::GetEmptySpeakerStats() const { + AppendSpeakerStats *ans = new AppendSpeakerStats(); + for (size_t i = 0; i < transforms_.size(); i++) + ans->stats.push_back(transforms_[i]->GetEmptySpeakerStats()); + return ans; +} + +void AppendTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + AppendSpeakerStats *stats = dynamic_cast(speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix input_part = input.ColRange(dim_offset, this_dim); + transforms_[i]->TestingAccumulate(input_part, posteriors, + stats->stats[i]); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); +} + + +void AppendTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const AppendSpeakerStats *stats = + dynamic_cast(&speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_part = output->ColRange(dim_offset, this_dim); + transforms_[i]->TestingForward(input_part, *(stats->stats[i]), + &output_part); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); +} + +void AppendSpeakerStats::Estimate() { + for (size_t i = 0; i < stats.size(); i++) + stats[i]->Estimate(); +} + +AppendSpeakerStats::~AppendSpeakerStats() { + for (size_t i = 0; i < stats.size(); i++) + delete stats[i]; +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h index 4d76e936e2a..3d910e471cb 100644 --- a/src/adapt/generic-transform.h +++ b/src/adapt/generic-transform.h @@ -57,7 +57,7 @@ class NoOpTransform: public DifferentiableTransform { output->CopyFromMat(input); return NULL; } - virtual void TrainingBackward( + void TrainingBackward( const CuMatrixBase 
&input, const CuMatrixBase &output_deriv, int32 num_chunks, @@ -69,7 +69,7 @@ class NoOpTransform: public DifferentiableTransform { input_deriv->AddMat(1.0, output_deriv); } - virtual int32 NumFinalIterations() { return 0; } + int32 NumFinalIterations() override { return 0; } void Accumulate( int32 final_iter, @@ -151,7 +151,7 @@ class SequenceTransform: public DifferentiableTransform { int32 num_spk, const Posterior &posteriors, CuMatrixBase *output) const override; - virtual void TrainingBackward( + void TrainingBackward( const CuMatrixBase &input, const CuMatrixBase &output_deriv, int32 num_chunks, @@ -249,7 +249,7 @@ class AppendTransform: public DifferentiableTransform { int32 num_spk, const Posterior &posteriors, CuMatrixBase *output) const override; - virtual void TrainingBackward( + void TrainingBackward( const CuMatrixBase &input, const CuMatrixBase &output_deriv, int32 num_chunks, @@ -274,7 +274,7 @@ class AppendTransform: public DifferentiableTransform { const SubPosterior &posteriors, SpeakerStatsItf *speaker_stats) const override; - virtual void TestingForward( + void TestingForward( const MatrixBase &input, const SpeakerStatsItf &speaker_stats, MatrixBase *output) const override; From 88890a254caf7c14fabf46059631db9e1957ce6a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 15 Dec 2018 10:05:37 -0800 Subject: [PATCH 36/87] [src,egs] Documentation updates --- .../libs/nnet3/train/dropout_schedule.py | 20 ++++++++++++++++++- src/nnet3/nnet-chain-example.h | 7 +++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0ad93e5977d..0de9074517f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -186,9 +186,22 @@ def _get_component_dropout(dropout_schedule, data_fraction): def _get_dropout_proportions(dropout_schedule, data_fraction): """Returns dropout proportions based on the dropout_schedule for the - fraction of data seen at this stage of training. + fraction of data seen at this stage of training. Returns a list of + pairs (pattern, dropout_proportion); for instance, it might return + the list ['*', 0.625] meaning a dropout proportion of 0.625 is to + be applied to all dropout components. + Returns None if dropout_schedule is None. + dropout_schedule might be (in the sample case using the default pattern of + '*'): '0.1,0.5@0.5,0.1', meaning a piecewise linear function that starts at + 0.1 when data_fraction=0.0, rises to 0.5 when data_fraction=0.5, and falls + again to 0.1 when data_fraction=1.0. It can also contain space-separated + items of the form 'pattern=schedule', for instance: + '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0' + The more specific patterns should go later, otherwise they will be overridden + by the less specific patterns' commands. + Calls _get_component_dropout() for the different component name patterns in dropout_schedule. @@ -198,6 +211,7 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): See _self_test() for examples. data_fraction: The fraction of data seen until this stage of training. + """ if dropout_schedule is None: return None @@ -213,6 +227,10 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): """Return an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. + E.g. 
if _dropout_proportions(dropout_schedule, data_fraction) + returns [('*', 0.625)], this will return the string: + "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'" + Arguments: dropout_schedule: Value for the --trainer.dropout-schedule option. diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 187bb4ef3a3..cdb7338994a 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -60,7 +60,10 @@ struct NnetChainSupervision { std::vector indexes; - /// The supervision object, containing the FST. + /// The supervision object, containing the FST; its members are + /// weight, num_sequences, frames_per_sequence, label_dim, fst, + /// e2e_fsts (for e2e examples only); alignment_pdfs (which is required + /// only for nnet3-chain-acc-lda-stats). chain::Supervision supervision; /// This is a vector of per-frame weights, required to be between 0 and 1, @@ -270,7 +273,7 @@ class ChainExampleMerger { std::vector, NnetChainExampleStructureHasher, NnetChainExampleStructureCompare> MapType; -MapType eg_to_egs_; + MapType eg_to_egs_; }; From a52a6fcded08e7373df1981429259aa02a5f88a4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 15 Dec 2018 10:49:36 -0800 Subject: [PATCH 37/87] [src] Add some unfinished sources --- src/Makefile | 1 + src/nnet3a/Makefile | 23 ++ src/nnet3a/nnet-chaina-training.cc | 325 +++++++++++++++++++++++++++++ src/nnet3a/nnet-chaina-training.h | 129 ++++++++++++ src/nnet3a/notes.update | 219 +++++++++++++++++++ 5 files changed, 697 insertions(+) create mode 100644 src/nnet3a/Makefile create mode 100644 src/nnet3a/nnet-chaina-training.cc create mode 100644 src/nnet3a/nnet-chaina-training.h create mode 100644 src/nnet3a/notes.update diff --git a/src/Makefile b/src/Makefile index 8ddd579a9a5..9a5bb5f81d5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -172,6 +172,7 @@ adapt: base util matrix hmm cudamatrix nnet: base util hmm tree matrix cudamatrix nnet2: base util matrix lat gmm hmm tree transform cudamatrix nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext +nnet3a: base util matrix lat gmm hmm tree transform cudamatrix adapt nnet3 chain fstext rnnlm: base util matrix cudamatrix nnet3 lm hmm chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix transform tree gmm diff --git a/src/nnet3a/Makefile b/src/nnet3a/Makefile new file mode 100644 index 00000000000..8cca3ea5a05 --- /dev/null +++ b/src/nnet3a/Makefile @@ -0,0 +1,23 @@ +all: + +# This directory contains code related to the adaptation +# framework in ../adapt, for nnet3 and (principally) chain +# training. 
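+# (No object files are compiled here yet: OBJFILES below is empty, so this
+# just sets up the library name and the libraries it depends on.)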
+ +include ../kaldi.mk + +TESTFILES = + +OBJFILES = + +LIBNAME = kaldi-nnet3a + +ADDLIBS = ../fstext/kaldi-fstext.a ../chain/kaldi-chain.a \ + ../nnet3/kaldi-nnet3.a ../adapt/kaldi-adapt.a \ + ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../lat/kaldi-lat.a \ + ../matrix-kaldi-matrix.a ../util/kaldi-util.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc new file mode 100644 index 00000000000..a798cb597f5 --- /dev/null +++ b/src/nnet3a/nnet-chaina-training.cc @@ -0,0 +1,325 @@ +// nnet3/nnet-chain-training.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-chain-training.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, + const fst::StdVectorFst &den_fst, + Nnet *nnet): + opts_(opts), + den_graph_(den_fst, nnet->OutputDim("output")), + nnet_(nnet), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), + num_minibatches_processed_(0), + srand_seed_(RandInt(0, 100000)) { + if (opts.nnet_config.zero_component_stats) + ZeroComponentStats(nnet); + KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && + opts.nnet_config.max_param_change >= 0.0 && + opts.nnet_config.backstitch_training_interval > 0); + delta_nnet_ = nnet_->Copy(); + ScaleNnet(0.0, delta_nnet_); + const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); + num_max_change_per_component_applied_.resize(num_updatable, 0); + num_max_change_global_applied_ = 0; + + if (opts.nnet_config.read_cache != "") { + bool binary; + try { + Input ki(opts.nnet_config.read_cache, &binary); + compiler_.ReadCache(ki.Stream(), binary); + KALDI_LOG << "Read computation cache from " << opts.nnet_config.read_cache; + } catch (...) { + KALDI_WARN << "Could not open cached computation. 
" + "Probably this is the first training iteration."; + } + } +} + + +void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { + bool need_model_derivative = true; + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); + ComputationRequest request; + GetChainComputationRequest(*nnet_, chain_eg, need_model_derivative, + nnet_config.store_component_stats, + use_xent_regularization, need_model_derivative, + &request); + std::shared_ptr computation = compiler_.Compile(request); + + if (nnet_config.backstitch_training_scale > 0.0 && num_minibatches_processed_ + % nnet_config.backstitch_training_interval == + srand_seed_ % nnet_config.backstitch_training_interval) { + // backstitch training is incompatible with momentum > 0 + KALDI_ASSERT(nnet_config.momentum == 0.0); + FreezeNaturalGradient(true, delta_nnet_); + bool is_backstitch_step1 = true; + srand(srand_seed_ + num_minibatches_processed_); + ResetGenerators(nnet_); + TrainInternalBackstitch(chain_eg, *computation, is_backstitch_step1); + FreezeNaturalGradient(false, delta_nnet_); // un-freeze natural gradient + is_backstitch_step1 = false; + srand(srand_seed_ + num_minibatches_processed_); + ResetGenerators(nnet_); + TrainInternalBackstitch(chain_eg, *computation, is_backstitch_step1); + } else { // conventional training + TrainInternal(chain_eg, *computation); + } + if (num_minibatches_processed_ == 0) { + ConsolidateMemory(nnet_); + ConsolidateMemory(delta_nnet_); + } + num_minibatches_processed_++; +} + +void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, + const NnetComputation &computation) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. + NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + + // give the inputs to the computer object. + computer.AcceptInputs(*nnet_, eg.inputs); + computer.Run(); + + this->ProcessOutputs(false, eg, &computer); + computer.Run(); + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. + ApplyL2Regularization(*nnet_, + GetNumNvalues(eg.inputs, false) * + nnet_config.l2_regularize_factor, + delta_nnet_); + + // Updates the parameters of nnet + bool success = UpdateNnetWithMaxChange(*delta_nnet_, + nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, + &num_max_change_per_component_applied_, &num_max_change_global_applied_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. + ConstrainOrthonormal(nnet_); + + // Scale delta_nnet + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); +} + +void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, + const NnetComputation &computation, + bool is_backstitch_step1) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. 
+ NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + // give the inputs to the computer object. + computer.AcceptInputs(*nnet_, eg.inputs); + computer.Run(); + + bool is_backstitch_step2 = !is_backstitch_step1; + this->ProcessOutputs(is_backstitch_step2, eg, &computer); + computer.Run(); + + BaseFloat max_change_scale, scale_adding; + if (is_backstitch_step1) { + // max-change is scaled by backstitch_training_scale; + // delta_nnet is scaled by -backstitch_training_scale when added to nnet; + max_change_scale = nnet_config.backstitch_training_scale; + scale_adding = -nnet_config.backstitch_training_scale; + } else { + // max-change is scaled by 1 + backstitch_training_scale; + // delta_nnet is scaled by 1 + backstitch_training_scale when added to nnet; + max_change_scale = 1.0 + nnet_config.backstitch_training_scale; + scale_adding = 1.0 + nnet_config.backstitch_training_scale; + // If relevant, add in the part of the gradient that comes from L2 + // regularization. It may not be optimally inefficient to do it on both + // passes of the backstitch, like we do here, but it probably minimizes + // any harmful interactions with the max-change. + ApplyL2Regularization(*nnet_, + 1.0 / scale_adding * GetNumNvalues(eg.inputs, false) * + nnet_config.l2_regularize_factor, delta_nnet_); + } + + // Updates the parameters of nnet + UpdateNnetWithMaxChange(*delta_nnet_, + nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, + &num_max_change_per_component_applied_, &num_max_change_global_applied_); + + if (is_backstitch_step1) { + // The following will only do something if we have a LinearComponent or + // AffineComponent with orthonormal-constraint set to a nonzero value. We + // choose to do this only on the 1st backstitch step, for efficiency. + ConstrainOrthonormal(nnet_); + } + + if (!is_backstitch_step1) { + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). Do this + // after backstitch step 2 so that the stats are scaled down before we start + // the next minibatch. + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + } + + ScaleNnet(0.0, delta_nnet_); +} + +void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, + const NnetChainExample &eg, + NnetComputer *computer) { + // normally the eg will have just one output named 'output', but + // we don't assume this. + // In backstitch training, the output-name with the "_backstitch" suffix is + // the one computed after the first, backward step of backstitch. + const std::string suffix = (is_backstitch_step2 ? "_backstitch" : ""); + std::vector::const_iterator iter = eg.outputs.begin(), + end = eg.outputs.end(); + for (; iter != end; ++iter) { + const NnetChainSupervision &sup = *iter; + int32 node_index = nnet_->GetNodeIndex(sup.name); + if (node_index < 0 || + !nnet_->IsOutputNode(node_index)) + KALDI_ERR << "Network has no output named " << sup.name; + + const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + + bool use_xent = (opts_.chain_config.xent_regularize != 0.0); + std::string xent_name = sup.name + "-xent"; // typically "output-xent". + CuMatrix xent_deriv; + + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? 
&xent_deriv : NULL)); + + if (use_xent) { + // this block computes the cross-entropy objective. + const CuMatrixBase &xent_output = computer->GetOutput( + xent_name); + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight' + BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + + if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) { + CuVector cu_deriv_weights(sup.deriv_weights); + nnet_output_deriv.MulRowsVec(cu_deriv_weights); + if (use_xent) + xent_deriv.MulRowsVec(cu_deriv_weights); + } + + computer->AcceptInput(sup.name, &nnet_output_deriv); + + objf_info_[sup.name + suffix].UpdateStats(sup.name + suffix, + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (use_xent) { + xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer->AcceptInput(xent_name, &xent_deriv); + } + } +} + +bool NnetChainTrainer::PrintTotalStats() const { + unordered_map::const_iterator + iter = objf_info_.begin(), + end = objf_info_.end(); + bool ans = false; + for (; iter != end; ++iter) { + const std::string &name = iter->first; + const ObjectiveFunctionInfo &info = iter->second; + ans = info.PrintTotalStats(name) || ans; + } + PrintMaxChangeStats(); + return ans; +} + +void NnetChainTrainer::PrintMaxChangeStats() const { + KALDI_ASSERT(delta_nnet_ != NULL); + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + int32 i = 0; + for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { + Component *comp = delta_nnet_->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *uc = dynamic_cast(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied_[i] > 0) + KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) + << ", per-component max-change was enforced " + << (100.0 * num_max_change_per_component_applied_[i]) / + (num_minibatches_processed_ * + (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : + 1.0 + 1.0 / nnet_config.backstitch_training_interval)) + << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied_ > 0) + KALDI_LOG << "The global max-change was enforced " + << (100.0 * num_max_change_global_applied_) / + (num_minibatches_processed_ * + (nnet_config.backstitch_training_scale == 0.0 ? 
1.0 : + 1.0 + 1.0 / nnet_config.backstitch_training_interval)) + << " \% of the time."; +} + +NnetChainTrainer::~NnetChainTrainer() { + if (opts_.nnet_config.write_cache != "") { + Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); + compiler_.WriteCache(ko.Stream(), opts_.nnet_config.binary_write_cache); + KALDI_LOG << "Wrote computation cache to " << opts_.nnet_config.write_cache; + } + delete delta_nnet_; +} + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h new file mode 100644 index 00000000000..bc11212451e --- /dev/null +++ b/src/nnet3a/nnet-chaina-training.h @@ -0,0 +1,129 @@ +// nnet3a/nnet-chaina-training.h + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_CHAIN_TRAINING_H_ +#define KALDI_NNET3_NNET_CHAIN_TRAINING_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-training.h" +#include "chain/chain-training.h" +#include "chain/chain-den-graph.h" + +namespace kaldi { +namespace nnet3 { + + + +struct NnetChainaTrainingOptions { + NnetTrainerOptions bottom_nnet_config; + NnetTrainerOptions top_nnet_config; + bool train_bottom_nnet; // True if we will be training the bottom nnet. + bool train_top_nnet; // True if we will be training the top nnet. Either + // this or train_bottom_nnet must be true (else, what + // are we doing here?) + + chain::ChainTrainingOptions chain_config; + bool apply_deriv_weights; + NnetChainTrainingOptions(): apply_deriv_weights(true) { } + + void Register(OptionsItf *opts) { + // register bottom_nnet_config with the prefix bottom + ParseO + nnet_config.Register(opts); + chain_config.Register(opts); + opts->Register("apply-deriv-weights", &apply_deriv_weights, + "If true, apply the per-frame derivative weights stored with " + "the example"); + } +}; + + +/** + This class is for single-threaded training of neural nets using the 'chain' + model. +*/ +class NnetChainTrainer { + public: + NnetChainTrainer(const NnetChainTrainingOptions &config, + const fst::StdVectorFst &den_fst, + Nnet *nnet); + + // train on one minibatch. + void Train(const NnetChainExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + + ~NnetChainTrainer(); + private: + // The internal function for doing one step of conventional SGD training. 
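+  // ("Conventional" means a single forward/backward pass and a single
+  // parameter update per minibatch, as opposed to the two-pass backstitch
+  // update done by TrainInternalBackstitch() below.)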
+ void TrainInternal(const NnetChainExample &eg, + const NnetComputation &computation); + + // The internal function for doing one step of backstitch training. Depending + // on whether is_backstitch_step1 is true, It could be either the first + // (backward) step, or the second (forward) step of backstitch. + void TrainInternalBackstitch(const NnetChainExample &eg, + const NnetComputation &computation, + bool is_backstitch_step1); + + void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, + NnetComputer *computer); + + const NnetChainTrainingOptions opts_; + + chain::DenominatorGraph den_graph_; + Nnet *nnet_; + Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != + // 0.0. nnet representing accumulated parameter-change + // (we'd call this gradient_nnet_, but due to + // natural-gradient update, it's better to consider it as + // a delta-parameter nnet. + CachingOptimizingCompiler compiler_; + + // This code supports multiple output layers, even though in the + // normal case there will be just one output layer named "output". + // So we store the objective functions per output layer. + int32 num_minibatches_processed_; + + // stats for max-change. + std::vector num_max_change_per_component_applied_; + int32 num_max_change_global_applied_; + + unordered_map objf_info_; + + // This value is used in backstitch training when we need to ensure + // consistent dropout masks. It's set to a value derived from rand() + // when the class is initialized. + int32 srand_seed_; +}; + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_CHAIN_TRAINING_H_ diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update new file mode 100644 index 00000000000..f9e00758821 --- /dev/null +++ b/src/nnet3a/notes.update @@ -0,0 +1,219 @@ + + +Plans for binaries. + + nnet3-adapt --init|--copy|--adapt + + + + steps/chaina/init_chain_dir.sh + make den.fst, normalization.fst, + bottom.config, top.config, + bottom.raw, top.raw + +init.config, init.raw, 0.trans_mdl, + final.config (but not 0.raw yet, might need egs first). + + + + +nnet3-get-egs? + ... Make sure the length info and left/right context of each eg is included in the id? + - when we merge, + + steps/chaina/get_raw_egs.sh + + -- need to decide utts-per-spk-max in validation data? do it in process_egs. + + + ... takes options like --utts-per-spk-max --num-utts-subset --frames-per-job + (prev. frames-per-iter), --chunks-per-group (e.g. 4) + + steps/chaina/process_egs.sh [options] + + [shouldn't need any info not already in raw_egs dir, I hope. We'll later have a + multilingual version of this script]. + + steps/chaina/process_egs.sh [options] + + +======== + Monolingual case (training): + + README.txt + bottom.raw default.ada default.mdl default.den + info -> mfcc.config?? Or other config? + info.txt? + frame_subsampling_factor1 + frame_subsampling_factor2 + frame_subsampling_factor +.. we'll need to pass in chain opts such as: + +[for chain objective] + --leaky-hmm-coefficient +[for the neural nets]: + --max-param-change-{bottom,top} + --print-interval + --l2-regularize-factor (use same one). + --train-bottom-nnet {true,false} + +==== + - Merging egs: will already have merged into speaker groups in prepare_egs. + - Output names? output --> output-xent. + - Input names? Just input. (May add ivector later but I hope not to have to). + - Could modify nnet3-merge-egs to parse the keys and get weights and output + names (to keep the output names distinct and to incorporate the weights). 
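+
+ As a rough illustration only (the exact key format, and the --interpret-keys
+ option that relies on it, are spelled out further below; the helper name and
+ example key here are invented), stripping the trailing language-name and
+ weight off a key could look something like this:
+
+```
+ // Sketch, not existing code: given a key such as "utt1-150-40-40-default-1.0",
+ // put "default" in *lang and 1.0 in *weight; return false if the key does not
+ // end in "-<name>-<number>".  Needs <string> and <cstdlib>.
+ static bool ParseKeySuffix(const std::string &key,
+                            std::string *lang, double *weight) {
+   size_t p2 = key.rfind('-');
+   if (p2 == std::string::npos || p2 == 0 || p2 + 1 == key.size())
+     return false;
+   size_t p1 = key.rfind('-', p2 - 1);
+   if (p1 == std::string::npos)
+     return false;
+   *lang = key.substr(p1 + 1, p2 - p1 - 1);
+   char *end = NULL;
+   *weight = std::strtod(key.c_str() + p2 + 1, &end);
+   return end != NULL && *end == '\0' && !lang->empty();
+ }
+```
+ The real code would additionally check that the language name contains only
+ letters, numbers and '_', as required by the key convention described below.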
+ + -- Initially, in nnet3*get-egs, we'll dump with: + + utterance-id-{num_frames}-{left_context}-{right_context} + + We'll use that info, together with the speaker-id and utt2uniq information, to + merge chunks together into groups (preferably by utterance; if not, by speaker) + in process_egs.sh (the merging will be done in python). + + process_egs.sh will dump these as archives *and* scp files, but they will now + be in groups of 4. The network name will be added as the last-but-one field + in the key; we'll set it to 'default' by default, but it may be changed in + merge_egs.sh. The last field will be a weight to be incorporated just before + the final merge (by nnet3-chain-merge-egs with the --interpret-keys option). + So the keys at the input to the final merge will be of the form + + + info/chunks_per_spk + + We may also have a combine_egs.sh script which can combine egs from multiple + sources (assuming they have the same chunks_per_spk), and can assign them + to different language names if needed. + +==== + + Merging already-merged chain egs + + This is something that I am going to need for the new adaptation framework I am + working on. Currently in nnet-example-utils.cc and nnet-chain-example.cc, the + example-merging code does not support merging already-merged egs (search for already-merged). + This is something that I'm going to need to be supported at least in NnetChainExample, and + this would also need to be supported, I think, in the NnetExample merging code, since + I think the chain example merging code supports that code. If it would be helpful in + implementation, you may assume that all the egs to be merged have the same number + of 'n' values (e.g. it might be 4; it's the number of chunks per speaker that we use + for adaptation). + + After the examples have been merged I'd like a variable as follows to be set in + the NnetChainSupervision object: +``` + // This will be 1 in normal cases, but in the 'chaina' code (chain training + // with adaptation) it will be set to the number of chunks per speaker in + // this minibatch. For example if it's 4, then we are asserting that + // sequences n=0 through 3 all come from the same speaker, n=4 through 7 + // all come from the same speaker, and so on. + int32 chunks_per_spk; +``` +Please make sure this is 1 by default (e.g. in the constructor), that the +on-disk format stays the same when it's 1 (e.g. only write it if it's not 1) to +minimize code-version compatibility headaches; and only set it to +a value other than 1 when merging chain supervision objects that were +already merged (you can check that the sizes of the things being merged match). +We may later introduce such a variable in the NnetSupervision object, but +it's not needed just yet. + +This PR can go to my svd_draft branch in my personal repo, as it's part of +that project. +==== + +Interpreting keys when merging nnet and chain examples + +This is a change that will need to be made to nnet3-chain-merge-egs binary to support +the new adaptation framework. @hhadian, again, please get to this when you can but +it is not urgent at all. If someone else feels like they want to do it that's OK +with me too as long as you don't just sit on it without making progress, but please +have @hhadian check the code. 
+In ExampleMergingConfig, please add a new boolean config value, default false, registered +as follows: + + po->Register("interpret-keys", &interpret_keys, "If true, require the keys " + "on the example to end in something of the form -xxxxxx-yyy " + "where xxxxxx is a string with only letters, numbers and _, " + "which will be interpreted as a language-name (e.g. \"default\"," + "\"english\", \"french\"), and yyy is a floating point weight " + "e.g. 1.0, to be applied to the example. If the weight is not " + "1.0, then any NnetIo objects with names matching \"output\" and any chain " + "supervision objects will have their weights multiplied by this " + "weight. In addition, the merging will keep distinct language-names " + "distinct, and will ensure that the output keys end in -xxxxxx " + "where xxxxxx is the language-name. This is intended to support " + "the \"chaina\" adaptation framework.") + +and please make any implementation changes required to support it. When +weighting chain supervision objects, just multiply the 'weight' field in the +ChainSupervision object. I think when weighting NnetIo objects you can just +scale the GeneralMatrix, although I'm not sure if there is a generic way to do +that. (This probably only really makes sense with sparse supervision intended +to represent posterior probabilities in xent setups). Do this before merging; I +believe the chain merging code already checks for weight equality but you'll +have to also make sure it checks for network-name equality and encodes the +network name in the output key. I believe the output keys are currently not +really inspected so back compatibility won't be important. Also please make +sure there is a convenience function that makes it easy to extract the "xxxxxx" +network-name suffix from a chain example key; this will be needed in the +training code. + + + + +==== + + + info needed + ?den.fst? + + frame_subsampling_factor1 + frame_subsampling_factor2 + frame_subsampling_factor = their product. + + + separately: different den.fst's? one den.fst? +==== + Multilingual case (training): + + bottom.raw english.ada english.mdl <-- output vs. output_libri, output_wsj. No, will be too complicated (?) + ... just support one name. + spanish.ada spanish.mdl + +0.ada top.mdl + + +when randomizing + +we'll merge in a controlled way, e.g. nnet3-merge-egs --fixed +=== + + --bottom-subsampling-factor is the subsampling in the bottom + model (the feature extractor). frame-subsampling-factor + divided by this is the amount of subsampling in the top + model. In the training code we'll work this out from + the 't' values in the chain supervision object, and + the top network will actually run at the reduced frame + rate. + --top-network-is-recurrent is true if the top network is + recurrent and therefore we need to keep as much extra + context as possible in the features. + +How to work out the computations: + + We get the number of n values and the first and last 't' values in the input; + check they are contiguous. + + We get the number of 't' values in the output (the chain supervision) and + their spacing; this is interpreted as the frame-subsampling-factor, which is + not passed directly to nnet3-chaina-train. + + We are given the --bottom-subsampling-factor and (boolean) + top_network_is_recurrent. + + We work out the left-context and right-context of the bottom and + top networks. We first use this on the top network to work out, at the + top-network frame rate, the 't' values needed at the input + (e.g. 
frames -10 through 159 assuming the chunk size is 150 and + the network takes +-10 frames of context). From 4568b4ce16f6fa9fa7c00a4204fd4b81d0b2af46 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 20 Dec 2018 13:18:04 -0800 Subject: [PATCH 38/87] [src] More progress on nnet3a training code. --- src/adapt/differentiable-transform-itf.h | 7 +- src/chain/chain-training.h | 9 +- src/nnet3/nnet-chain-training.cc | 4 +- src/nnet3/nnet-example-utils.cc | 4 +- src/nnet3a/Makefile | 2 +- src/nnet3a/nnet-chaina-training.cc | 616 ++++++++++++------ src/nnet3a/nnet-chaina-training.h | 788 +++++++++++++++++++++-- src/nnet3a/nnet-chaina-utils.h | 39 ++ src/nnet3a/notes.update | 57 +- 9 files changed, 1278 insertions(+), 248 deletions(-) create mode 100644 src/nnet3a/nnet-chaina-utils.h diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index e1b7e25c210..13983c8213f 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -49,6 +49,7 @@ class SpeakerStatsItf { }; + /** This class is for speaker-dependent feature-space transformations -- principally various varieties of fMLLR, including mean-only, diagonal and @@ -166,8 +167,8 @@ class DifferentiableTransform { of time frames. @param [in] num_spk The number of speakers. Must be greater than one, and must divide num_chunks. The number of chunks per speaker - (num_chunks / num_spk) must be the same for all speakers, and the - chunks for a speaker must be consecutive. + must be the same for all speakers (it will equal num_chunks / num_spk), + and the chunks for a speaker must be consecutively numbered. @param [in] posteriors (note: this is a vector of vector of pair). This provides, in 'soft-count' form, the class supervision information that is used for the @@ -205,7 +206,7 @@ class DifferentiableTransform { values. @param [in] minibatch_info The pointer returned by the corresponding call to TrainingForward() (may be NULL). This function - takes possession of the pointer. If for some reason the + takes ownership of the pointer. If for some reason the backward pass was not done, the caller will likely want to delete it themselves. @param [in,out] input_deriv The derivative at the input, i.e. diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 6ea70b5ca41..63e03c7e35f 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -99,7 +99,7 @@ struct ChainTrainingOptions { example; you'll want to divide it by 'tot_weight' before displaying it. @param [out] l2_term The l2 regularization term in the objective function, if - the --l2-regularize option is used. To be added to 'o + the --l2-regularize option is used (else will be set to 0.0). @param [out] weight The weight to normalize the objective function by; equals supervision.weight * supervision.num_sequences * supervision.frames_per_sequence. @@ -115,6 +115,10 @@ struct ChainTrainingOptions { peak memory use). xent_output_deriv will be used in the cross-entropy regularization code; it is also used in computing the cross-entropy objective value. + @param [out] numerator_post If non-NULL, then the posterior from the numerator + forward-backward will be written here (note: it won't be + scaled by the supervision weight). This is intended for + use in the adaptation framework used in "chaina" training. 
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -124,7 +128,8 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv = NULL); + CuMatrix *xent_output_deriv = NULL, + Posterior *numerator_post = NULL); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index a798cb597f5..481f7989131 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -111,8 +111,8 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, this->ProcessOutputs(false, eg, &computer); computer.Run(); - // If relevant, add in the part of the gradient that comes from L2 - // regularization. + // If relevant, add in the part of the gradient that comes from + // parameter-level L2 regularization. ApplyL2Regularization(*nnet_, GetNumNvalues(eg.inputs, false) * nnet_config.l2_regularize_factor, diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..adbfae95794 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -140,7 +140,7 @@ static void MergeIo(const std::vector &src, // we could easily support merging already-merged egs, but I don't see a // need for it right now. KALDI_ASSERT(output_iter[i].n == 0 && - "Merging already-merged egs? Not currentlysupported."); + "Merging already-merged egs? Not currently supported."); output_iter[i].n = n; } this_offset += this_size; // note: this_offset is a reference. @@ -556,7 +556,7 @@ bool UtteranceSplitter::LengthsMatch(const std::string &utt, int32 length_tolerance) const { int32 sf = config_.frame_subsampling_factor, expected_supervision_length = (utterance_length + sf - 1) / sf; - if (std::abs(supervision_length - expected_supervision_length) + if (std::abs(supervision_length - expected_supervision_length) <= length_tolerance) { return true; } else { diff --git a/src/nnet3a/Makefile b/src/nnet3a/Makefile index 8cca3ea5a05..dfa01545af5 100644 --- a/src/nnet3a/Makefile +++ b/src/nnet3a/Makefile @@ -8,7 +8,7 @@ include ../kaldi.mk TESTFILES = -OBJFILES = +OBJFILES = nnet-chaina-training.o LIBNAME = kaldi-nnet3a diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index a798cb597f5..a506dd75855 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -1,4 +1,4 @@ -// nnet3/nnet-chain-training.cc +// nnet3/nnet-chaina-training.cc // Copyright 2015 Johns Hopkins University (author: Daniel Povey) // 2016 Xiaohui Zhang @@ -18,254 +18,506 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#include "nnet3/nnet-chain-training.h" #include "nnet3/nnet-utils.h" +#include "nnet3a/nnet-chaina-training.h" +#include "nnet3a/nnet-chaina-utils.h" namespace kaldi { namespace nnet3 { -NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, - const fst::StdVectorFst &den_fst, - Nnet *nnet): - opts_(opts), +NnetChainaTopTrainer::NnetChainaTopTrainer( + const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransform &transform, + CachingOptimizingCompiler *compiler, + Nnet *nnet): + lang_name_(lang_name), + opts_(config), den_graph_(den_fst, nnet->OutputDim("output")), + transform_(transform), + compiler_(compiler), nnet_(nnet), - compiler_(*nnet, opts_.nnet_config.optimize_config, - opts_.nnet_config.compiler_config), + delta_nnet_(nnet->Copy()), num_minibatches_processed_(0), - srand_seed_(RandInt(0, 100000)) { - if (opts.nnet_config.zero_component_stats) - ZeroComponentStats(nnet); - KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && - opts.nnet_config.max_param_change >= 0.0 && - opts.nnet_config.backstitch_training_interval > 0); - delta_nnet_ = nnet_->Copy(); - ScaleNnet(0.0, delta_nnet_); + num_max_change_global_applied_si_(0), + num_max_change_global_applied_(0) { + const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; - - if (opts.nnet_config.read_cache != "") { - bool binary; - try { - Input ki(opts.nnet_config.read_cache, &binary); - compiler_.ReadCache(ki.Stream(), binary); - KALDI_LOG << "Read computation cache from " << opts.nnet_config.read_cache; - } catch (...) { - KALDI_WARN << "Could not open cached computation. " - "Probably this is the first training iteration."; - } + num_max_change_per_component_applied_si_.resize(num_updatable, 0); + + if (opts_.nnet_config.zero_component_stats) + ZeroComponentStats(nnet); + + ScaleNnet(0.0, delta_nnet_); + if (opts_.nnet_config.read_cache != "") { + // It would be complicated to implement, as there are various top nnets + // and they would all try to read and write the same cache files. 
+ // To implement this, the best way would be to + KALDI_WARN << "The read-cache options are not currently supported."; } + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0 && + opts_.nnet_config.max_param_change >= 0.0); } -void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { - bool need_model_derivative = true; - const NnetTrainerOptions &nnet_config = opts_.nnet_config; - bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); - ComputationRequest request; - GetChainComputationRequest(*nnet_, chain_eg, need_model_derivative, - nnet_config.store_component_stats, - use_xent_regularization, need_model_derivative, - &request); - std::shared_ptr computation = compiler_.Compile(request); - - if (nnet_config.backstitch_training_scale > 0.0 && num_minibatches_processed_ - % nnet_config.backstitch_training_interval == - srand_seed_ % nnet_config.backstitch_training_interval) { - // backstitch training is incompatible with momentum > 0 - KALDI_ASSERT(nnet_config.momentum == 0.0); - FreezeNaturalGradient(true, delta_nnet_); - bool is_backstitch_step1 = true; - srand(srand_seed_ + num_minibatches_processed_); - ResetGenerators(nnet_); - TrainInternalBackstitch(chain_eg, *computation, is_backstitch_step1); - FreezeNaturalGradient(false, delta_nnet_); // un-freeze natural gradient - is_backstitch_step1 = false; - srand(srand_seed_ + num_minibatches_processed_); - ResetGenerators(nnet_); - TrainInternalBackstitch(chain_eg, *computation, is_backstitch_step1); - } else { // conventional training - TrainInternal(chain_eg, *computation); - } - if (num_minibatches_processed_ == 0) { +/** + TODO: include this somewhere. + if (num_minibatches_processed_ == 0) { ConsolidateMemory(nnet_); ConsolidateMemory(delta_nnet_); } - num_minibatches_processed_++; +*/ + + +std::shared_ptr NnetChainaTopTrainer::GetComputation( + const ComputationStructure &s) { + { + auto iter = computation_map_.find(s); + if (iter != computation_map_.end()) + return iter->second; + } + int32 num_sequences = s.num_sequences, + frames_per_sequence_in = s.frames_per_sequence_in, + frames_per_sequence_out = s.frames_per_sequence_out, + first_input_t = s.first_input_t, + first_output_t = 0, + top_subsampling_factor = s.top_subsampling_factor; + + ComputationRequest request; + request.need_model_derivative = opts_.train_top_nnet; + + request.store_component_stats = true; + request.inputs.resize(1); + request.inputs[0].name = "input"; + request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); + request.inputs[0].has_deriv = s.need_input_deriv; + // The inputs are in the order: all frames of sequence 0; then all frames of + // sequence 1; and so on. This is done + auto iter = request.inputs[0].indexes.begin(); + for (int32 n = 0; n < num_sequences; n++) { + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t,++iter) { + iter->n = n; + iter->t = t; + } + } + // ... but the outputs are in the order: the first frame of all sequences; + // the second frame of all sequences; and so on. + request.outputs.resize(2); + request.outputs[0].name = (s.adapted ? 
"output" : "output-si"); + request.outputs[0].has_deriv = true; + request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); + int32 t_stride_out = top_subsampling_factor; + iter = request.outputs[0].indexes.begin(); + for (int32 t = first_output_t; + t < first_output_t + frames_per_sequence_out * t_stride_out; + t += t_stride_out) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + } + } + request.outputs[1].has_deriv = true; + request.outputs[1].name = (s.adapted ? "output-xent" : "output-xent-si"); + request.outputs[1].indexes = request.outputs[0].indexes; + std::shared_ptr computation = compiler_->Compile( + request); + computation_map_[s] = computation; + return computation; } -void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, - const NnetComputation &computation) { +bool NnetChainaTopTrainer::TrainUnadapted( + const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + const CuVectorBase &deriv_weights, + Posterior *posterior, + CuMatrixBase *input_deriv) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to // store stats. NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); + // Freeze the natural gradient. We dont want to update the NG scatter + // matrices on this data because we'll next be running the same nnet on the + // speaker-adapted version of the same data, and it would violate the + // independence assumptions needed for NG to work if we updated them. + FreezeNaturalGradient(true, delta_nnet_); + // give the inputs to the computer object. - computer.AcceptInputs(*nnet_, eg.inputs); + CuMatrix input_copy(input); + computer.AcceptInput("input", &input_copy); computer.Run(); - this->ProcessOutputs(false, eg, &computer); - computer.Run(); + const CuMatrixBase + &output = computer.GetOutput("output-si"), + &output_xent = computer.GetOutput("output-si-xent"); + CuMatrix output_deriv(output.NumRows(), + output.NumCols(), + kUndefined), + output_xent_deriv; + + // Note: we don't normally use the l2 term any more, parameter-level + // regularization seems to work better. + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + supervision, output, + &tot_objf, &tot_l2_term, &tot_weight, + &output_deriv, &output_xent_deriv, + posterior); + + { + // this block computes and keeps track of the cross-entropy objective. + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight' + BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); + output_si_xent_objf_.UpdateStats(lang_name_ + ":output-si-xent", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } - // If relevant, add in the part of the gradient that comes from L2 - // regularization. 
- ApplyL2Regularization(*nnet_, - GetNumNvalues(eg.inputs, false) * - nnet_config.l2_regularize_factor, - delta_nnet_); + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { + output_deriv.MulRowsVec(deriv_weights); + output_xent_deriv.MulRowsVec(deriv_weights); + } - // Updates the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + if (opts_.unadapted_deriv_scale != 1.0) + output_deriv.Scale(opts_.unadapted_deriv_scale); - // Scale down the batchnorm stats (keeps them fresh... this affects what - // happens when we use the model with batchnorm test-mode set). - ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + computer.AcceptInput("output-si", &output_deriv); - // The following will only do something if we have a LinearComponent - // or AffineComponent with orthonormal-constraint set to a nonzero value. - ConstrainOrthonormal(nnet_); + output_xent_deriv.Scale(opts_.chain_config.xent_regularize * + opts_.unadapted_deriv_scale); + computer.AcceptInput("output-si-xent", &output_xent_deriv); + + output_si_objf_.UpdateStats(lang_name_ + ":output-si", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); - // Scale delta_nnet - if (success) + // Do the backprop. We know we're either updating the nnet or need the + // input derivatives (else, what point is there in training), so there + // must be a backprop pass. + computer.Run(); + + if (input_deriv != NULL) { + input_deriv->AddMat(opts_.unadapted_backprop_scale, + computer.GetOutput("input")); + } + + // Updates the parameters of nnet. Since the derivatives will all be scaled + // with "unadapted_deriv_scale" it makes sense to apply that same factor to + // the max-change, to keep the max-change in proportion with how much we + // expect the net to change (so smaller max-change values don't lead to more + // emphasize on the unadapted model's derivatives) + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + opts_.unadapted_deriv_scale, + 1.0 - nnet_config.momentum, // normally momentum is 0.0. + nnet_, + &num_max_change_per_component_applied_si_, + &num_max_change_global_applied_si_); + + // Un-freeze the natural gradient. + FreezeNaturalGradient(false, delta_nnet_); + + if (!success) ScaleNnet(nnet_config.momentum, delta_nnet_); else ScaleNnet(0.0, delta_nnet_); + return success; } -void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, - const NnetComputation &computation, - bool is_backstitch_step1) { +bool NnetChainaTopTrainer::TrainAdapted( + const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + const CuVectorBase &deriv_weights, + CuMatrixBase *input_deriv) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to // store stats. NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); + // give the inputs to the computer object. 
- computer.AcceptInputs(*nnet_, eg.inputs); + CuMatrix input_copy(input); + computer.AcceptInput("input", &input_copy); computer.Run(); - bool is_backstitch_step2 = !is_backstitch_step1; - this->ProcessOutputs(is_backstitch_step2, eg, &computer); + const CuMatrixBase + &output = computer.GetOutput("output"), + &output_xent = computer.GetOutput("output-xent"); + CuMatrix output_deriv(output.NumRows(), + output.NumCols(), + kUndefined), + output_xent_deriv; + + // Note: we don't normally use the l2 term any more, parameter-level + // regularization seems to work better. + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + supervision, output, + &tot_objf, &tot_l2_term, &tot_weight, + &output_deriv, &output_xent_deriv); + + { + // this block computes and keeps track of the cross-entropy objective. + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight' + BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); + output_xent_objf_.UpdateStats(lang_name_ + ":output-xent", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { + output_deriv.MulRowsVec(deriv_weights); + output_xent_deriv.MulRowsVec(deriv_weights); + } + + computer.AcceptInput("output", &output_deriv); + output_xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer.AcceptInput("output-xent", &output_xent_deriv); + + output_objf_.UpdateStats(lang_name_ + ":output", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (input_deriv == NULL && !opts_.train_top_nnet) { + // We're neither training the top model nor need the input derivatives. + // E.g., we might be just getting stats for batch normalization after + // training the model. + return true; + } + + // Do the backprop. We know we're either updating the nnet or need the + // input derivatives (else, what point is there in training), so there + // must be a backprop pass. computer.Run(); - BaseFloat max_change_scale, scale_adding; - if (is_backstitch_step1) { - // max-change is scaled by backstitch_training_scale; - // delta_nnet is scaled by -backstitch_training_scale when added to nnet; - max_change_scale = nnet_config.backstitch_training_scale; - scale_adding = -nnet_config.backstitch_training_scale; - } else { - // max-change is scaled by 1 + backstitch_training_scale; - // delta_nnet is scaled by 1 + backstitch_training_scale when added to nnet; - max_change_scale = 1.0 + nnet_config.backstitch_training_scale; - scale_adding = 1.0 + nnet_config.backstitch_training_scale; - // If relevant, add in the part of the gradient that comes from L2 - // regularization. It may not be optimally inefficient to do it on both - // passes of the backstitch, like we do here, but it probably minimizes - // any harmful interactions with the max-change. - ApplyL2Regularization(*nnet_, - 1.0 / scale_adding * GetNumNvalues(eg.inputs, false) * - nnet_config.l2_regularize_factor, delta_nnet_); + if (input_deriv != NULL) { + input_deriv->AddMat(1.0, computer.GetOutput("input")); } - // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + // Updates the parameters of nnet. 
Since the derivatives will all be scaled + // with "unadapted_deriv_scale" it makes sense to apply that same factor to + // the max-change, to keep the max-change in proportion with how much we + // expect the net to change (so smaller max-change values don't lead to more + // emphasize on the unadapted model's derivatives) + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + opts_.unadapted_deriv_scale, + 1.0 - nnet_config.momentum, // normally momentum is 0.0. + nnet_, + &num_max_change_per_component_applied_si_, + &num_max_change_global_applied_si_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). + // Note: we don't do this for the unadapted pass, it would be redundant + // (although of course doing it only once changes the interpretation + // of the scale slightly). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); - if (is_backstitch_step1) { - // The following will only do something if we have a LinearComponent or - // AffineComponent with orthonormal-constraint set to a nonzero value. We - // choose to do this only on the 1st backstitch step, for efficiency. - ConstrainOrthonormal(nnet_); + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. + ConstrainOrthonormal(nnet_); + + if (!success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + return success; +} + + +bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, + int32 num_sequences, + int32 num_spk, + int32 first_input_t, + int32 top_subsampling_factor, + const VectorBase &deriv_weights_in, + const chain::Supervision &supervision, + CuMatrixBase *input_deriv) { + KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences != 0); + int32 frames_per_sequence_in = input.NumRows() / num_sequences, + frames_per_sequence_out = supervision.frames_per_sequence; + + bool adapted = false; + ComputationStructure structure( + adapted, (input_deriv != NULL), + num_sequences, frames_per_sequence_in, frames_per_sequence_out, + first_input_t, top_subsampling_factor); + + Posterior post; + + CuVector deriv_weights(deriv_weights_in); + + std::shared_ptr computation_unadapted = + GetComputation(structure); + if (!TrainUnadapted(input, *computation_unadapted, supervision, + deriv_weights, &post, input_deriv)) { + num_minibatches_processed_++; + if (input_deriv) + input_deriv->SetZero(); + return false; } - if (!is_backstitch_step1) { - // Scale down the batchnorm stats (keeps them fresh... this affects what - // happens when we use the model with batchnorm test-mode set). Do this - // after backstitch step 2 so that the stats are scaled down before we start - // the next minibatch. 
- ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + Posterior post_padded(input.NumRows()); + ConvertPosterior(post, num_sequences, first_input_t, + top_subsampling_factor, &post_padded); + + structure.adapted = true; + std::shared_ptr computation_adapted = + GetComputation(structure); + + CuMatrix adapted_input(input.NumRows(), input.NumCols(), + kUndefined), + adapted_input_deriv(input.NumRows(), input.NumCols()); + + using namespace differentiable_transform; + MinibatchInfoItf *minibatch_info = transform_.TrainingForward( + input, num_sequences, num_spk, post_padded, &adapted_input); + + if (!TrainAdapted(adapted_input, *computation_adapted, supervision, + deriv_weights, &adapted_input_deriv)) { + num_minibatches_processed_++; + if (input_deriv) + input_deriv->SetZero(); + return false; } - ScaleNnet(0.0, delta_nnet_); + if (input_deriv == NULL) { + delete minibatch_info; + } else { + transform_.TrainingBackward(input, adapted_input_deriv, + num_sequences, num_spk, post_padded, + minibatch_info, input_deriv); + } + num_minibatches_processed_++; + return true; } -void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, - const NnetChainExample &eg, - NnetComputer *computer) { - // normally the eg will have just one output named 'output', but - // we don't assume this. - // In backstitch training, the output-name with the "_backstitch" suffix is - // the one computed after the first, backward step of backstitch. - const std::string suffix = (is_backstitch_step2 ? "_backstitch" : ""); - std::vector::const_iterator iter = eg.outputs.begin(), - end = eg.outputs.end(); - for (; iter != end; ++iter) { - const NnetChainSupervision &sup = *iter; - int32 node_index = nnet_->GetNodeIndex(sup.name); - if (node_index < 0 || - !nnet_->IsOutputNode(node_index)) - KALDI_ERR << "Network has no output named " << sup.name; - - const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); - CuMatrix nnet_output_deriv(nnet_output.NumRows(), - nnet_output.NumCols(), - kUndefined); - - bool use_xent = (opts_.chain_config.xent_regularize != 0.0); - std::string xent_name = sup.name + "-xent"; // typically "output-xent". - CuMatrix xent_deriv; - - BaseFloat tot_objf, tot_l2_term, tot_weight; - - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); - - if (use_xent) { - // this block computes the cross-entropy objective. - const CuMatrixBase &xent_output = computer->GetOutput( - xent_name); - // at this point, xent_deriv is posteriors derived from the numerator - // computation. 
note, xent_objf has a factor of '.supervision.weight' - BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, xent_objf); - } - if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) { - CuVector cu_deriv_weights(sup.deriv_weights); - nnet_output_deriv.MulRowsVec(cu_deriv_weights); - if (use_xent) - xent_deriv.MulRowsVec(cu_deriv_weights); - } +NnetComputer* NnetChainaBottomTrainer::Forward( + int32 num_sequences, + int32 first_input_t, + int32 first_output_t, + int32 frames_per_sequence_out, + CuMatrix *input, + CuMatrix *output) { + KALDI_ASSERT(input->NumRows() != 0 && input->NumRows() % num_sequences == 0); + int32 frames_per_sequence_in = input->NumRows() / num_sequences; + ComputationStructure s(opts_.train_bottom_nnet, + num_sequences, + frames_per_sequence_in, + frames_per_sequence_out, + first_input_t, first_output_t); + std::shared_ptr computation = GetComputation(s); + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + NnetComputer *computer = new NnetComputer(nnet_config.compute_config, + computation, nnet_, delta_nnet_); + computer.AcceptInput("input", input); + computer.Run(); + computer.GetOutputDestructive("output", output); + return computer; +} + + +void NnetChainaBottomTrainer::Backward(NnetComputer *computer, + CuMatrix *output_deriv) { + computer->AcceptInput("output", output_deriv); + computer->Run(); - computer->AcceptInput(sup.name, &nnet_output_deriv); + // TODO. - objf_info_[sup.name + suffix].UpdateStats(sup.name + suffix, - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, tot_objf, tot_l2_term); + // Updates the parameters of nnet. Since the derivatives will all be scaled + // with "unadapted_deriv_scale" it makes sense to apply that same factor to + // the max-change, to keep the max-change in proportion with how much we + // expect the net to change (so smaller max-change values don't lead to more + // emphasize on the unadapted model's derivatives) + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + opts_.unadapted_deriv_scale, + 1.0 - nnet_config.momentum, // normally momentum is 0.0. + nnet_, + &num_max_change_per_component_applied_si_, + &num_max_change_global_applied_si_); - if (use_xent) { - xent_deriv.Scale(opts_.chain_config.xent_regularize); - computer->AcceptInput(xent_name, &xent_deriv); +} + +std::shared_ptr NnetChainaBottomTrainer::GetComputation( + const ComputationStructure &s) { + { + auto iter = computation_map_.find(s); + if (iter != computation_map_.end()) + return iter->second; + } + int32 num_sequences = s.num_sequences, + frames_per_sequence_in = s.frames_per_sequence_in, + frames_per_sequence_out = s.frames_per_sequence_out, + first_input_t = s.first_input_t, + first_output_t = s.first_output_t; + + ComputationRequest request; + request.need_model_derivative = train_bottom_model_; + request.store_component_stats = true; + request.inputs.resize(1); + request.inputs[0].name = "input"; + request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); + // The inputs are in the order: all frames of sequence 0; then all frames of + // sequence 1; and so on. 
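+  // (A purely illustrative example of this layout, not code: with
+  // num_sequences = 2, frames_per_sequence_in = 3 and first_input_t = -1,
+  // the input indexes are laid out as
+  //    (n,t) = (0,-1), (0,0), (0,1), (1,-1), (1,0), (1,1),
+  // whereas the output indexes, set up further below, are frame-major, e.g.
+  //    (n,t) = (0,0), (1,0), (0,3), (1,3), ... if first_output_t = 0 and
+  // t_stride_out = 3.)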
This is done + auto iter = request.inputs[0].indexes.begin(); + for (int32 n = n < num_sequences; n++) { + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t,++iter) { + iter->n = n; + iter->t = t; } } + // ... but the outputs are in the order: the first frame of all sequences; + // the second frame of all sequences; and so on. + request.outputs.resize(1); + request.outputs[0].name = "output"; + request.outputs[1].has_deriv = train_bottom_model_; + request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); + int32 t_stride_out = bottom_subsampling_factor_; + iter = request.outputs[0].indexes.begin(); + for (int32 t = first_output_t; + t < first_output_t + frames_per_sequence_out * t_stride_out; + t += t_stride_out) { + for (int32 n = n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + } + } + std::shared_ptr computation = compiler_.Compile( + request); + computation_map_[s] = computation; + return computation; } + bool NnetChainTrainer::PrintTotalStats() const { unordered_map::const_iterator iter = objf_info_.begin(), diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index bc11212451e..aaad4858979 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -17,8 +17,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#ifndef KALDI_NNET3_NNET_CHAIN_TRAINING_H_ -#define KALDI_NNET3_NNET_CHAIN_TRAINING_H_ +#ifndef KALDI_NNET3_NNET_CHAINA_TRAINING_H_ +#define KALDI_NNET3_NNET_CHAINA_TRAINING_H_ #include "nnet3/nnet-example.h" #include "nnet3/nnet-computation.h" @@ -26,50 +26,588 @@ #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-chain-example.h" #include "nnet3/nnet-training.h" +#include "nnet3/am-nnet-simple.h" #include "chain/chain-training.h" #include "chain/chain-den-graph.h" +#include "adapt/differentiable-transform-itf.h" namespace kaldi { namespace nnet3 { - - struct NnetChainaTrainingOptions { - NnetTrainerOptions bottom_nnet_config; - NnetTrainerOptions top_nnet_config; - bool train_bottom_nnet; // True if we will be training the bottom nnet. - bool train_top_nnet; // True if we will be training the top nnet. Either - // this or train_bottom_nnet must be true (else, what - // are we doing here?) - + NnetTrainerOptions nnet_config; chain::ChainTrainingOptions chain_config; bool apply_deriv_weights; - NnetChainTrainingOptions(): apply_deriv_weights(true) { } + BaseFloat unadapted_deriv_scale; + BaseFloat unadapted_backprop_scale; + bool train_bottom_nnet; // True if we will be training the bottom nnet. + bool train_top_nnet; // True if we will be training the top nnet. 
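+  // As a worked example of how the next option interacts with the
+  // frame-subsampling factor (illustrative numbers only, following the help
+  // strings below): if the egs were dumped with --frame-subsampling-factor=3
+  // and we set --bottom-subsampling-factor=3, the embeddings are produced
+  // once every 3 input frames and the top nnet's implicit subsampling factor
+  // is 3 / 3 = 1; with --bottom-subsampling-factor=1 the top nnet itself
+  // would subsample by a factor of 3 / 1 = 3.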
+ int32 bottom_subsampling_factor; + bool keep_embedding_context; + + NnetChainaTrainingOptions(): + apply_deriv_weights(true), + unadapted_deriv_scale(0.5), + unadapted_backprop_scale(1.0), + train_bottom_nnet(true), + train_top_nnet(true), + bottom_subsampling_factor(1), + keep_embedding_context(true) { } void Register(OptionsItf *opts) { - // register bottom_nnet_config with the prefix bottom - ParseO nnet_config.Register(opts); chain_config.Register(opts); + opts->Register("train-bottom-nnet", &train_bottom_nnet, + "Set this to false to disable training of bottom nnet."); + opts->Register("train-top-nnet", &train_top_nnet, + "Set this to false to disable training of top nnet."); + opts->Register("bottom-subsampling-factor", &bottom_subsampling_factor, + "Determines the frequency at which we subsample the " + "embeddings from the bottom nnet. Implicitly, the " + "subsampling factor in the top nnet is the overall " + "--frame-subsampling-factor (determined when we dumped " + "the egs) divided by this value."); + opts->Register("keep-embedding-context", &keep_embedding_context, + "If true, we compute as much left/right context of the " + "embedding vectors (the output of the bottom nnet) as is " + "possible given the provided input features in the eg. " + "You'll generally only want this to be true " + "if the top network is recurrent or otherwise has " + "optional dependencies (for example: if it uses " + "StatisticsExtractionComponent, IfDefined(), Failover(), " + "etc.)."); opts->Register("apply-deriv-weights", &apply_deriv_weights, "If true, apply the per-frame derivative weights stored with " "the example"); + opts->Register("unadapted-deriv-scale", &unadapted_deriv_scale, + "Scale on the derivatives (and max-change values, for the top " + "nnet) for the unadapted branches of the nnets (at the outputs " + "output-si and output-si-xent. Affects how strongly the nnets " + "are trained by the unadapted embeddings. Note: this also " + "affects the derivatives given to the bottom nnet. The scale " + "on the adapted branch is implicitly 1.0."); + opts->Register("unadapted-backprop-scale", &unadapted_backprop_scale, + "Scale that is applied to the derivatives arising from the " + "unadapted branch of the top nnets, when backpropagating " + "to the embeddings. Affects how much we prioritize the " + "unadapted features. Note: this is effectively multiplied by " + "unadapted-deriv-scale; unadapted-deriv-scale also affects " + "training of the top nnet."); + } + void Check() { + KALDI_ASSERT(unadapted_deriv_scale > 0.0 && + unadapted_backprop_scale >= 0.0); + // TODO: add more checks? } + }; /** - This class is for single-threaded training of neural nets using the 'chain' - model. + This struct, intended mostly to be accessed by NnetChainaTrainer, handles the + logic of reading the models and their corresponding denominator FSTs from + disk, and of writing out the corresponding (raw) trained models when + this iteration of training has finished. + + The reason this is not entirely trivial is that we want to make it easy + to support the multilingual case. In this case there is one 'bottom' + model (the embedding extractor) but there may be multiple 'top' models, + each with their associated transition model and denominator FST, and their + own name. We use a directory to organize these. + */ +class NnetChainaModels { + public: + /** + Constructor to which you pass the model directory and the den-fst + directory. 
The directory structure is: + /bottom.raw + should exist, and then for each language name "lang", the following + files should exist: + /lang.mdl /lang.fst /lang.ada + + In practice, the language name will be either "default", in the + typical (monolingual) setup, or it might be arbitrary strings + representing languages such as "english", "french" (in + + In general the language can be any string containing ASCII letters, numbers + or underscores, and it will be a suffix of the key in the egs that we are + reading, separated from them by a "-". E.g. if the key is + "143213423-1234123432_10-english", the language would be "english". + The models and denominator FSTs will only be read when they are + actually required. + */ + NnetChainaModels(const std::string &model_dir, + const std::string &den_fst_dir, + const std::string &transform_dir); + + Nnet* GetBottomNnet(); + + int32 BottomNnetLeftContext() const; + int32 BottomNnetRightContext() const; + + /** + Returns the AmNnetSimple object corresponding to a given language + name (e.g. "default", "english", "french"). Note: the model + file /.mdl will contain a TransitionModel and an + AmNnetSimple object + */ + AmNnetSimple *GetNnetForLang(const std::string &language_name); + + + const TransitionModel *GetTransitionModelForLang( + const std::string &language_name); + + + fst::StdVectorFst *GetDenFstForLang(const std::string &language_name); + + // This convenience function returns the Nnet object in the + // AmNnetSimple object returned by 'GetNnetForLang'. + Nnet *GetRawNnetForLang(const std::string &language_name); + + differentiable_transform::DifferentiableTransform *GetTransformForLang( + const std::string &language_name); + + + // Writes to 'langs' a vector (in no particular order) of the + // names of all the languages that have been loaded (this will depend + // on whether they were represented in the egs). This might + // be [ "default" ], or it might be [ "english", "french" ], for + // example. + void ListAllLanguages(std::vector *langs); + + // Writes the files + // /bottom..raw + // and, for each language that we accessed, + // /..raw + void WriteRawModels(const std::string &model_out_dir, + int32 job_id); + + ~NnetChainaModels(); + private: + // Directory where models are located. + std::string model_dir_; + // Directory where denominator FSTs are located. + std::string den_fst_dir_; + // Directory where transforms (type: DifferentiableTransform) are located. + std::string transform_dir_; + + // This corresponds to /bottom.raw. + Nnet bottom_nnet_; + // The left and right context of bottom_nnet_. + int32 bottom_nnet_left_context_; + int32 bottom_nnet_right_context_; + + // Data that is loaded per language. + + struct LanguageInfo { + // trans_model and am_nnet come from /.mdl + TransitionModel trans_model; + AmNnetSimple am_nnet; + // den_fst comes from /.fst + fst::StdVectorFst den_fst; + // trans comes from /.ada + differentiable_transform::DifferentiableTransform *trans; + }; + + std::unordered_map lang_info_; + +}; + +/** + steps of training: + + for a minibatch: + work out the language + work out how many chunks per speaker + work out the context and how many frames of embeddings are + needed. + + See whether we need backprop and model update for the two + passes of training. + Make the 3 computations. 
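+
+   A rough usage sketch in terms of the accessors declared above (the call
+   sequence here is only illustrative, not final code):
+
+     Nnet *bottom = models->GetBottomNnet();
+     AmNnetSimple *top = models->GetNnetForLang("english");
+     fst::StdVectorFst *den_fst = models->GetDenFstForLang("english");
+     // ... run the bottom nnet on the input features to get embeddings,
+     // train the top nnet against 'den_fst', then backprop the embedding
+     // derivatives into the bottom nnet.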
+ + + + We need + + */ + + +/** + This object, which has a similar function to NnetChainTrainer, trains the + 'top' model for a single language and (optionally) outputs the derivatives + required to obtain the 'bottom' model. + */ +class NnetChainaTopTrainer { + public: + /** + Constructor. + @param [in] lang_name The name of the language this corresponds to (for diagnostics). + E.g. "default", "english", etc. + @param [in] config Options class + @param [in] train_top_model True if we are training the 'top' model... this is one + configuration value that's outside 'config', that we need. + @param [in] den_fst The denominator FST for this language + @param [in] transform The transform object which will be used to produce adapted + features after the first pass of training. + @param [in] compiler A pointer to the compiler we are to use (we make it + owned externally for easier caching). + @param [in,out] nnet The neural net we are training. Expected to have outputs + called "output-si" (speaker-independent output), "output", + "output-si-xent", "output-xent", and an input called + "input". This class does not take ownership of the pointer. + */ + NnetChainaTopTrainer(const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransform &transform, + CachingOptimizingCompiler *compiler, + Nnet *nnet); + + /** Train on one minibatch. + @param [in] input The input (unadapted) features, most likely the embeddings + that are the output of the 'bottom' nnet. Assumed to form a + regular grid with the 't' value having higher stride, so the + first 'num_sequences' rows would correspond to the + lowest-numbered frames for all sequences, and so on. + @param [in] num_sequences The number of sequences/chunks represented + in 'input' (a.k.a. the minibatch size). Actually this must + be equal to supervision.num_sequences, but it's easier for + reasons of clarity and documentation repeat it here. + @param [in] num_spk The total number of speakers. Must be >1, and must divide + num_sequences. The number of sequences per speaker + must be the same for all speakers (it will equal num_sequences / num_spk), + and the sequences for a speaker must be consecutively numbered. + @param [in] first_input_t The 't' value corresponding to the first input + frame (will normally be a negative number, corresponding to the left + context we are giving to the 'top' model, since we assume that the + sequences have 't' values starting from 0). The 't' values at + the input will be consecutive, and the number of frames per sequence + will equal input.NumRows() / num_sequences. Note: if the embeddings + are computed at a lower frame rate than the original features, we + renumber things to make the embeddings consecutive. + @param [in] top_subsampling_factor The subsampling factor of the top network + (which will equal the frame subsampling factor implicit in the original + egs that we read, divided by bottom_subsampling_factor). E.g. this + might frequently be 1 or 3. The frames at the output of the 'top' + nnet are evaluated for 't' values that are multiples of + 'top_subsampling_factor', starting from t=0. + @param [in] supervision The chain supervision object representing the objective + function at the output. Its num_sequences must equal the + num_sequences passed into this function separately. + @param [out] input_deriv If non-NULL, the derivative of the objective function + w.r.t. 
the input features will be written to here (this function assumes + that its value is zero on entry). + @return Returns true if it successfully trained on this minbiatch, false + on error (e.g. if a NaN was generated, which should not really happen). + */ + bool Train(const CuMatrixBase &input, + int32 num_sequences, + int32 num_spk, + int32 first_input_t, + int32 top_subsampling_factor, + const VectorBase &deriv_weights, + const chain::Supervision &supervision, + CuMatrixBase *input_deriv = NULL); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + + ~NnetChainaTopTrainer(); + private: + + // We use this as an index with which to look up computations, kind of like a + // lookaside buffer; it avoids creating a much larger structure with large + // vectors of Indexes in it. + struct ComputationStructure { + bool adapted; + bool need_input_deriv; + int32 num_sequences; + int32 frames_per_sequence_in; + int32 frames_per_sequence_out; + int32 first_input_t; + int32 top_subsampling_factor; + inline bool operator == (const ComputationStructure &other) const { + return adapted == other.adapted && + need_input_deriv == other.need_input_deriv && + num_sequences == other.num_sequences && + frames_per_sequence_in == other.frames_per_sequence_in && + frames_per_sequence_out == other.frames_per_sequence_out && + first_input_t == other.first_input_t && + top_subsampling_factor == other.top_subsampling_factor; + }; + ComputationStructure (const ComputationStructure &other) = default; + ComputationStructure &operator = ( + const ComputationStructure &other) = default; + /** + Constructor. + @param [in] adapted True if we want the outputs from "output" and + "output-xent", and false if we want the outputs from + "output-si" and "output-si-xent". + @param [in] need_input_deriv True if we need the derivative w.r.t. + the features that are the input to this computation. + @param [in] num_sequences The number of sequences in this minibatch + (a.k.a. the minibatch size). + @param [in] frames_per_sequence_in The number of frames for each sequence + of input features. They are assumed to be consecutively + numbered. + @param [in] frames_per_sequence_out The 'frames_per_sequence' in + the ChainSupervision object, i.e. the length of the + output sequences of the computation. + @param [in] first_input_t The first 't' value in the input + sequence; will normally be negative (corresponding to + the negative of the number of frames of left context). + @param [in] top_subsampling_factor Frame subsampling factor at the + output; e.g., 3 would mean we are evaluating the output + at frames t=0, t=3, and so on. + */ + ComputationStructure(bool adapted, + bool need_input_deriv, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 top_subsampling_factor); + }; + struct ComputationHasher { + inline size_t operator() (const ComputationStructure &s) const { + return size_t(s.num_sequences) + + 10 * size_t(s.frames_per_sequence_in) + + 100 * size_t(s.frames_per_sequence_out) + + 1000 * size_t(s.first_input_t) + + 10000 * size_t(s.top_subsampling_factor); + } + }; + + // This is a faster lookup mechanism for the computation than + // is provided by the compiler's inherent caching. 
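+  // The lookup pattern (a sketch only; the real code is in GetComputation()
+  // below) is:
+  //   auto iter = computation_map_.find(s);
+  //   if (iter != computation_map_.end()) return iter->second;
+  //   // ...otherwise build the ComputationRequest for 's', compile it with
+  //   // the compiler, cache the resulting shared_ptr in computation_map_[s],
+  //   // and return it.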
+ std::unordered_map, + ComputationHasher> computation_map_; + + // This wraps the call to the compiler. See constructor + // of struct ComputationStructure for more documentation. + std::shared_ptr GetComputation( + const ComputationStructure &s); + + + /** + This does the training on the unadapted branch ("si" / speaker-independent) + of the neural net. + @param [in] input The input features, as supplied to Train(). Order + of rows is: the first frame of all sequences; the + second frame of all sequences; and so on. + @param [in] computation The computation corresponding to the unadapted + branch of the nnet. + @param [in] supervision The chain supervision object. The nnet output + dimensions are worked out from this, as well as + using this object to compute the objective function. + @param [in] deriv_weights Weights to be applied to the derivatives for the + corresponding frames of the output (order is: + first frame for all sequences; second frame for + all sequences, etc.). May be stored with the + egs. If this is the empty vector or + --apply-deriv-weights=false, they won't be + appplied. + @param [out] posterior The posteriors from the numerator forward-backward + on the adaptation model will be written to here. + The number of frames will be the number of frames in + the output sequence (supervision.frames_per_sequence), + and the order is: all sequences' frame 0; then all + sequences' frame 1; and so on. + @param [out] input_deriv Derivative w.r.t. the input features; this will + be added to, if it is not NULL. This function + applies the scale opts_.unadapted_backprop_weight + after adding this derivative to it. (The scale + opts_.unadapted_backprop_scale is implicitly + included already as we already scaled the objf + derivatives). + @return Returns true if the training went through successfully + (it should very rarely return false, e.g. if a NaN was generated). + */ + bool TrainUnadapted(const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + const CuVectorBase &deriv_weights, + Posterior *posterior, + CuMatrixBase *input_deriv); + + /** + Converts the format of the posterior from how it is at the output of the + network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). + + The number of frames per sequence at the output will equal + post_at_output.size() / num_sequences, and the number of frames per + sequence at the input will equal post_at_inptu->size() / num_sequences + (note: this means 'post_at_input is expected to be appropriately sized + when this function is called). + */ + void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + Posterior *post_at_input); + + /** + Does the adapted pass of training. + @param [in] input The adapted input features. + @param [in] computation The adapted version of the + computation (this one uses the outputs + "output" and "output-xent" instead of + "output-si" and "output-si-xent". + @param [in] supervision The chain supervision + object, containing information derived + from the numerator lattices. 
+ @param [in] deriv_weights Weights to be applied to the derivatives for the + corresponding frames of the output (order is: + first frame for all sequences; second frame for + all sequences, etc.). May be stored with the + egs. If this is the empty vector or + --apply-deriv-weights=false, they won't be + appplied. + @param [in,out] input_deriv If non-NULL, the + feature derivative w.r.t. the [speaker-adapted] input + features will be *added* to this location. + @return + */ + bool TrainAdapted(const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + const CuVectorBase &deriv_weights, + CuMatrixBase *input_deriv); + + + void ProcessOutputs(const NnetChainExample &eg, + NnetComputer *computer); + + std::string lang_name_; + + const NnetChainaTrainingOptions &opts_; + chain::DenominatorGraph den_graph_; + const differentiable_transform::DifferentiableTransform &transform_; + // This is a pointer to a compiler owned outside this class (we had to + // implement it like this to enable computation caching to work with a single + // option). + CachingOptimizingCompiler *compiler_; + + + Nnet *nnet_; + Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != + // 0.0. nnet representing accumulated parameter-change + // (we'd call this gradient_nnet_, but due to + // natural-gradient update, it's better to consider it as + // a delta-parameter nnet. + + + // These objects keep track of the objective-function values for the 4 + // outputs. We have the regular output (sequence objective) and the 'xent' + // output for cross-entropy regularization, and there are speaker independent + // (si) versions of those outputs also. + ObjectiveFunctionInfo output_si_objf_; + ObjectiveFunctionInfo output_si_xent_objf_; + ObjectiveFunctionInfo output_objf_; + ObjectiveFunctionInfo output_xent_objf_; + + // Number of minibatches processed. Note: we actually train the nnet twice + // per minibatch, because there are the speaker-independent and + // speaker-dependent passes. + int32 num_minibatches_processed_; + + // stats for max-change (for speaker-independent model). + std::vector num_max_change_per_component_applied_si_; + int32 num_max_change_global_applied_si_; + // stats for max-change (for speaker-dependent model). + std::vector num_max_change_per_component_applied_; + int32 num_max_change_global_applied_; +}; + + + +/** + This object, which has a similar function to NnetChainTrainer, takes care of + evaluating and possibly training the 'bottom' model. */ -class NnetChainTrainer { +class NnetChainaBottomTrainer { public: - NnetChainTrainer(const NnetChainTrainingOptions &config, - const fst::StdVectorFst &den_fst, - Nnet *nnet); + /** + Constructor. + @param [in] nnet_config Options class + @param [in] train_bottom_model True if we are training the 'bottom' model + (otherwise this class just does the computation without + any backprop). + @param [in] bottom_subsampling_factor The factor by which we subsample + frames at the output of the 'bottom' nnet. E.g. if + this is 3, then the output frames in each sequence + would be numbered t=0, t=3, and so on. + @param [in,out] nnet The neural net we are training. Expected (for now) + to have an input called 'input' (corresponding to + the original input features and an output called + 'output' (corresponding to the embeddings). 
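+
+        The expected calling pattern (a summary of Forward() and Backward()
+        below, not extra API): call Forward() to compute the embeddings and
+        get back an NnetComputer*; obtain the derivative w.r.t. those
+        embeddings from the top trainer; then either pass it to Backward()
+        (which trains the bottom nnet and deletes the computer) or, if the
+        bottom nnet is not being trained, just delete the computer.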
+ */ + NnetChainaBottomTrainer(const NnetTrainerOptions &nnet_config, + int32 bottom_subsampling_factor, + bool train_bottom_model, + CachingOptimizingCompiler *compiler, + Nnet *nnet); + + /** Train on one minibatch. + @param [in] num_sequences The number of sequences/chunks represented + in 'input' (a.k.a. the minibatch size). + @param [in] first_input_t The 't' value corresponding to the first input + frame (will normally be a negative number). The 't' values at + the input will be consecutive, and the number of frames per sequence + will equal input.NumRows() / num_sequences. Note: if the embeddings + are computed at a lower frame rate than the original features, we + renumber things to make the embeddings consecutive. + (Note: bottom_subsampling_factor was passed in in the constructor). + @param [in] first_output_t The 't' value corresponding to the first output + frame (will normally be a negative number, corresponding to the left + context we are giving to the 'top' model, since we assume that the + sequences have 't' values starting from 0). The 't' values at + the output will be separated by the 'bottom_subsampling_factor' + which was given to the constructor. (We'll renumber them + by dividing them by 'bottom_subsampling_factor' before giving + them to the 'top' network. + @param [in] frames_per_sequence_out The number of output frames per sequence. + This is determined by the context of the top and bottom nnets + and the "keep_embedding_context" config value. + @param [in] input The input features, most likely raw MFCC or filterbank + features. A pointer, since it is consumed destructively + (via 'swap'). + @param [out] output The output will be written to here. + @return Returns the NnetComputer object that we did the computation with; + the user should either pass this into Backward(), or delete it. + */ + NnetComputer* Forward(int32 num_sequences, + int32 first_input_t, + int32 first_output_t, + int32 frames_per_sequence_out, + CuMatrix *input, + CuMatrix *output); + + + /** + Does the backward pass, which will do model training. This will only be + called if the bottom nnet needs to be trained (otherwise the caller will + delete the 'computer' object. + @param [in] computer The computer object returned from the + forward pass. This function takes ownership of it and + will delete it when done with it. + @param [in] output_deriv The derivative w.r.t. the output of + the forward pass. It is consumed destructively + by this function. + + */ + void Backward(NnetComputer *computer, + CuMatrix *output_deriv); - // train on one minibatch. - void Train(const NnetChainExample &eg); // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -78,52 +616,208 @@ class NnetChainTrainer { // per-component max-change and global max-change were enforced. void PrintMaxChangeStats() const; - ~NnetChainTrainer(); + ~NnetChainaBottomTrainer(); private: - // The internal function for doing one step of conventional SGD training. - void TrainInternal(const NnetChainExample &eg, - const NnetComputation &computation); - - // The internal function for doing one step of backstitch training. Depending - // on whether is_backstitch_step1 is true, It could be either the first - // (backward) step, or the second (forward) step of backstitch. 
- void TrainInternalBackstitch(const NnetChainExample &eg, - const NnetComputation &computation, - bool is_backstitch_step1); - - void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, + + // We use this as an index with which to look up computations, kind of like a + // lookaside buffer; it avoids creating a much larger structure with large + // vectors of Indexes in it. + struct ComputationStructure { + bool train_model; + int32 num_sequences; + int32 frames_per_sequence_in; + int32 frames_per_sequence_out; + int32 first_input_t; + int32 first_output_t; + inline bool operator == (const ComputationStructure &other) const { + return train_model == other.train_model && + num_sequences == other.num_sequences && + frames_per_sequence_in == other.frames_per_sequence_in && + frames_per_sequence_out == other.frames_per_sequence_out && + first_input_t == other.first_input_t && + first_output_t == other.first_output_t; + }; + ComputationStructure (const ComputationStructure &other) = default; + ComputationStructure &operator = ( + const ComputationStructure &other) = default; + /** + Constructor. + @param [in] train_model True if we are going to train the bottom model. + @param [in] need_input_deriv True if we need the derivative w.r.t. + the features that are the input to this computation. + @param [in] num_sequences The number of sequences in this minibatch + (a.k.a. the minibatch size). + @param [in] frames_per_sequence_in The number of frames for each sequence + of input features. They are assumed to be consecutively + numbered. + @param [in] frames_per_sequence_out The 'frames_per_sequence' in + the ChainSupervision object, i.e. the length of the + output sequences of the computation. + @param [in] first_input_t The first 't' value in the input + sequence; will normally be negative (corresponding to + the negative of the number of frames of left context). + */ + ComputationStructure(bool train_model, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 first_output_t); + }; + struct ComputationHasher { + inline size_t operator() (const ComputationStructure &s) const { + return size_t(s.num_sequences) + + 10 * size_t(s.frames_per_sequence_in) + + 100 * size_t(s.frames_per_sequence_out) + + 1000 * size_t(s.first_input_t) + + 10000 * size_t(s.first_output_t); + } + }; + + // This is a faster lookup mechanism for the computation than + // is provided by the compiler's inherent caching. + std::unordered_map, + ComputationHasher> computation_map_; + + // This wraps the call to the compiler. See constructor + // of struct ComputationStructure for more documentation. + std::shared_ptr GetComputation( + const ComputationStructure &s); + + + + /** + Converts the format of the posterior from how it is at the output of the + network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). + + The number of frames per sequence at the output will equal + post_at_output.size() / num_sequences, and the number of frames per + sequence at the input will equal post_at_inptu->size() / num_sequences + (note: this means 'post_at_input is expected to be appropriately sized + when this function is called). 
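+
+     For example (illustrative numbers only): with top_subsampling_factor = 3,
+     first_input_t = -2 and 4 output frames per sequence (t = 0, 3, 6, 9),
+     the posterior for output frame t = 0 would be copied to input frames
+     t = 0, 1 and 2 of the same sequence, and the context frames t = -2 and
+     t = -1 would get empty posteriors.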
+ */ + void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + Posterior *post_at_input); + + /** + Does the adapted pass of training. + @param [in] input The adapted input features. + @param [in] computation The adapted version of the + computation (this one uses the outputs + "output" and "output-xent" instead of + "output-si" and "output-si-xent". + @param [in] supervision The chain supervision + object, containing information derived + from the numerator lattices. + @param [in,out] input_deriv If non-NULL, the + feature derivative w.r.t. the [speaker-adapted] input + features will be *added* to this location. + */ + void TrainAdapted(const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + const VectorBase &deriv_weights, + CuMatrixBase *input_deriv); + + + void ProcessOutputs(const NnetChainExample &eg, NnetComputer *computer); - const NnetChainTrainingOptions opts_; + std::string lang_name_; + const NnetChainaTrainingOptions opts_; + bool train_top_model_; chain::DenominatorGraph den_graph_; + const differentiable_transform::DifferentiableTransform &transform_; + Nnet *nnet_; Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != // 0.0. nnet representing accumulated parameter-change // (we'd call this gradient_nnet_, but due to // natural-gradient update, it's better to consider it as // a delta-parameter nnet. - CachingOptimizingCompiler compiler_; - // This code supports multiple output layers, even though in the - // normal case there will be just one output layer named "output". - // So we store the objective functions per output layer. + // This is a pointer to a compiler owned outside this class (we had to + // implement it like this to enable computation caching to work with a single + // option). + CachingOptimizingCompiler *compiler_; + + // These objects keep track of the objective-function values for the 4 + // outputs. We have the regular output (sequence objective) and the 'xent' + // output for cross-entropy regularization, and there are speaker independent + // (si) versions of those outputs also. + ObjectiveFunctionInfo output_si_objf_; + ObjectiveFunctionInfo output_si_xent_objf_; + ObjectiveFunctionInfo output_objf_; + ObjectiveFunctionInfo output_xent_objf_; + + // Number of minibatches processed. Note: we actually train the nnet twice + // per minibatch, because there are the speaker-independent and + // speaker-dependent passes. int32 num_minibatches_processed_; - // stats for max-change. + // stats for max-change (for speaker-independent model). + std::vector num_max_change_per_component_applied_si_; + int32 num_max_change_global_applied_si_; + // stats for max-change (for speaker-dependent model). std::vector num_max_change_per_component_applied_; int32 num_max_change_global_applied_; +}; + + + +/** + This class is for single-threaded training of neural nets using the 'chain' + model and our adaptation framework +*/ +class NnetChainaTrainer { + public: + /** + Constructor + @param [in] config Options class + @param [in] models Object that provides access to the models and + denominator FSTs, indexed as appropriate by language-id. + */ + NnetChainaTrainer(const NnetChainaTrainingOptions &config, + NnetChainaModels *models); + + // train on one minibatch. + void Train(const NnetChainExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. 
+ bool PrintTotalStats() const; - unordered_map objf_info_; + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + + ~NnetChainaTrainer(); + private: + + + const NnetChainaTrainingOptions &opts_; + NnetChainaModels *models_; + // This 'compiler' object is shared by bottom_trainer and the objects + // stores in top_trainers_. Storing it here is helpful to simplify writing and + // reading of computation caches. + CachingOptimizingCompiler compiler_; - // This value is used in backstitch training when we need to ensure - // consistent dropout masks. It's set to a value derived from rand() - // when the class is initialized. - int32 srand_seed_; + NnetChainaBottomTrainer *bottom_trainer_; + // map from language name (e.g. "default", "english", "french") to + // the object that trains the corresponding 'top' nnet. + std::unordered_map top_trainers_; }; } // namespace nnet3 } // namespace kaldi -#endif // KALDI_NNET3_NNET_CHAIN_TRAINING_H_ +#endif // KALDI_NNET3_NNET_CHAINA_TRAINING_H_ diff --git a/src/nnet3a/nnet-chaina-utils.h b/src/nnet3a/nnet-chaina-utils.h new file mode 100644 index 00000000000..0259459496f --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils.h @@ -0,0 +1,39 @@ +// nnet3a/nnet-chaina-utils.h + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_CHAINA_UTILS_H_ +#define KALDI_NNET3_NNET_CHAINA_UTILS_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-training.h" +#include "chain/chain-training.h" +#include "chain/chain-den-graph.h" + +namespace kaldi { +namespace nnet3 { + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_CHAINA_UTILS_H_ diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update index f9e00758821..af3464e4e32 100644 --- a/src/nnet3a/notes.update +++ b/src/nnet3a/notes.update @@ -1,10 +1,15 @@ +-- Extend nnet3-copy-egs, to supply at least a minimum context in input features by + duplicating frames as needed. E.g. + --extend-left-context=12 --extend-right-context=10 + +============= Plans for binaries. nnet3-adapt --init|--copy|--adapt - +================ steps/chaina/init_chain_dir.sh make den.fst, normalization.fst, @@ -15,10 +20,10 @@ init.config, init.raw, 0.trans_mdl, final.config (but not 0.raw yet, might need egs first). - +============ nnet3-get-egs? - ... Make sure the length info and left/right context of each eg is included in the id? + ... Make sure the length info and left/right context of each eg is included in the id? - when we merge, steps/chaina/get_raw_egs.sh @@ -57,6 +62,27 @@ nnet3-get-egs? 
--l2-regularize-factor (use same one). --train-bottom-nnet {true,false} +==== + nnet3-copy-egs: maybe introduce an option to extend context? + +=== + +prepare_egs.sh... + - merging into speaker groups. done by python script. Originally we'll dump with: + + utterance-id-{num_frames_out}-{frame_subsampling_factor}-{left_context}-{right_context} + + - so the number of input frames would be + ((num_frames_out - 1) * frame_subsampling_factor) + 1 + left_context + right_context + + + utterance-id-{num_frames_out}-{frame_subsampling_factor}-{left_context}-{right_context} + +=== + + BUT, we don't want to do this on minibatches + + ==== - Merging egs: will already have merged into speaker groups in prepare_egs. - Output names? output --> output-xent. @@ -73,11 +99,24 @@ nnet3-get-egs? in process_egs.sh (the merging will be done in python). process_egs.sh will dump these as archives *and* scp files, but they will now - be in groups of 4. The network name will be added as the last-but-one field - in the key; we'll set it to 'default' by default, but it may be changed in - merge_egs.sh. The last field will be a weight to be incorporated just before - the final merge (by nnet3-chain-merge-egs with the --interpret-keys option). - So the keys at the input to the final merge will be of the form + be in groups of chunks_per_spk (e.g. 4). The language name will be added as the + last-but-two field in the key; we'll set it to 'default' by default, but it may + be changed in merge_egs.sh. The last two fields will be (1) a weight to be incorporated + just before the final merge (by nnet3-chain-merge-egs with the --interpret-keys + option), and (2) a weight to propagate back to the bottom network (if you want a + particular language to have less of an effect on the bottom network). + + So the keys at the input to the final merge will be of the form: + {language-name}-{egs-weight}-{bottom-nnet-weight} + + And the keys at the output of the final merge would be of the form: + {language-name}-{bottom-nnet-weight}-0-0 + The 'egs-weight' (which becomes weight in the chain supervision objects, + which is a scale on the objective function) will already have been set + in the ChainSupervision object. + The 0 and 0 becom + + info/chunks_per_spk @@ -196,7 +235,7 @@ we'll merge in a controlled way, e.g. nnet3-merge-egs --fixed the 't' values in the chain supervision object, and the top network will actually run at the reduced frame rate. - --top-network-is-recurrent is true if the top network is + --keep-embedding-context is true if the top network is recurrent and therefore we need to keep as much extra context as possible in the features. From 88226e3c97de93821f35c93dde09fb34dd596a8c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 21 Dec 2018 14:15:32 -0800 Subject: [PATCH 39/87] [src] More code progress. 
--- src/nnet3/nnet-batch-compute.cc | 2 +- src/nnet3/nnet-chain-example.cc | 36 ++++-- src/nnet3/nnet-chain-example.h | 7 + src/nnet3/nnet-chain-training.cc | 52 ++------ src/nnet3/nnet-chain-training.h | 14 +- src/nnet3/nnet-training.cc | 47 ++----- src/nnet3/nnet-training.h | 8 +- src/nnet3/nnet-utils.cc | 42 ++++++ src/nnet3/nnet-utils.h | 29 +++++ src/nnet3a/nnet-chaina-training.cc | 198 ++++++++++++++++------------- src/nnet3a/nnet-chaina-training.h | 117 +++-------------- src/nnet3a/notes.update | 3 +- 12 files changed, 255 insertions(+), 300 deletions(-) diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc index 6db046796be..a4baceb2d7f 100644 --- a/src/nnet3/nnet-batch-compute.cc +++ b/src/nnet3/nnet-batch-compute.cc @@ -135,7 +135,7 @@ NnetBatchComputer::GetHighestPriorityComputation( int32 *minibatch_size_out, std::vector *tasks) { tasks->clear(); - std::unique_lock(mutex_); + std::unique_lock lock(mutex_); MapType::iterator iter = tasks_.begin(), end = tasks_.end(), best_iter = tasks_.end(); double highest_priority = -std::numeric_limits::infinity(); diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index c627bb1032a..c31c7630eec 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -33,13 +33,18 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { supervision.Write(os, binary); WriteToken(os, binary, ""); deriv_weights.Write(os, binary); + if (chunks_per_spk != 1) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, chunks_per_spk); + } WriteToken(os, binary, ""); } bool NnetChainSupervision::operator == (const NnetChainSupervision &other) const { return name == other.name && indexes == other.indexes && supervision == other.supervision && - deriv_weights.ApproxEqual(other.deriv_weights); + deriv_weights.ApproxEqual(other.deriv_weights) && + chunks_per_spk == other.chunks_per_spk; } void NnetChainSupervision::Read(std::istream &is, bool binary) { @@ -47,17 +52,17 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { ReadToken(is, binary, &name); ReadIndexVector(is, binary, &indexes); supervision.Read(is, binary); - std::string token; - ReadToken(is, binary, &token); - // in the future this back-compatibility code can be reworked. - if (token != "") { - KALDI_ASSERT(token == "" || token == ""); - if (token == "") - ReadVectorAsChar(is, binary, &deriv_weights); - else - deriv_weights.Read(is, binary); - ExpectToken(is, binary, ""); + // If the following fails, you may be using much older egs that are no longer + // supported to be read by the current code -> re-dump the egs. 
+ ExpectToken(is, binary, ""); + deriv_weights.Read(is, binary); + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &chunks_per_spk); + } else { + chunks_per_spk = 1; } + ExpectToken(is, binary, ""); CheckDim(); } @@ -75,6 +80,8 @@ void NnetChainSupervision::CheckDim() const { frame_skip = indexes[supervision.num_sequences].t - first_frame, num_sequences = supervision.num_sequences, frames_per_sequence = supervision.frames_per_sequence; + KALDI_ASSERT(chunks_per_spk > 0 && + num_sequences % chunks_per_spk == 0); int32 k = 0; for (int32 i = 0; i < frames_per_sequence; i++) { for (int32 j = 0; j < num_sequences; j++,k++) { @@ -93,13 +100,15 @@ NnetChainSupervision::NnetChainSupervision(const NnetChainSupervision &other): name(other.name), indexes(other.indexes), supervision(other.supervision), - deriv_weights(other.deriv_weights) { CheckDim(); } + deriv_weights(other.deriv_weights), + chunks_per_spk(other.chunks_per_spk) { CheckDim(); } void NnetChainSupervision::Swap(NnetChainSupervision *other) { name.swap(other->name); indexes.swap(other->indexes); supervision.Swap(&(other->supervision)); deriv_weights.Swap(&(other->deriv_weights)); + std::swap(chunks_per_spk, other->chunks_per_spk); if (RandInt(0, 5) == 0) CheckDim(); } @@ -112,7 +121,8 @@ NnetChainSupervision::NnetChainSupervision( int32 frame_skip): name(name), supervision(supervision), - deriv_weights(deriv_weights) { + deriv_weights(deriv_weights), + chunks_per_spk(1) { // note: this will set the 'x' index to zero. indexes.resize(supervision.num_sequences * supervision.frames_per_sequence); diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index cdb7338994a..5122739a38c 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -79,6 +79,13 @@ struct NnetChainSupervision { /// to disk compactly as unsigned char. Vector deriv_weights; + /// This will be 1 in normal cases, but in the 'chaina' code (chain training + /// with adaptation) it will be set to the number of chunks/sequences per + /// speaker in this minibatch. For example if it's 4, then we are asserting + /// that sequences n=0 through 3 all come from the same speaker, n=4 through 7 + /// all come from the same speaker, and so on. 
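+  /// For back compatibility this is only written to disk (with a
+  /// ChunksPerSpk token) when its value is not 1, and egs written before
+  /// this field existed are read with chunks_per_spk = 1; see Write() and
+  /// Read() in nnet-chain-example.cc.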
+ int32 chunks_per_spk; + // Use default assignment operator NnetChainSupervision() { } diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 481f7989131..d9562887817 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -33,6 +33,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, compiler_(*nnet, opts_.nnet_config.optimize_config, opts_.nnet_config.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); @@ -41,9 +42,6 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, opts.nnet_config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (opts.nnet_config.read_cache != "") { bool binary; @@ -119,9 +117,11 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, delta_nnet_); // Updates the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + 1.0, 1.0 - nnet_config.momentum, nnet_, + &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). @@ -176,9 +176,10 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + UpdateNnetWithMaxChange( + *delta_nnet_, nnet_config.max_param_change, + max_change_scale, scale_adding, nnet_, + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -276,41 +277,10 @@ bool NnetChainTrainer::PrintTotalStats() const { const ObjectiveFunctionInfo &info = iter->second; ans = info.PrintTotalStats(name) || ans; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetChainTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - const NnetTrainerOptions &nnet_config = opts_.nnet_config; - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 
1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; -} - NnetChainTrainer::~NnetChainTrainer() { if (opts_.nnet_config.write_cache != "") { Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..bc5143491ac 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -64,10 +64,6 @@ class NnetChainTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetChainTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -88,11 +84,8 @@ class NnetChainTrainer { chain::DenominatorGraph den_graph_; Nnet *nnet_; - Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != - // 0.0. nnet representing accumulated parameter-change - // (we'd call this gradient_nnet_, but due to - // natural-gradient update, it's better to consider it as - // a delta-parameter nnet. + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. CachingOptimizingCompiler compiler_; // This code supports multiple output layers, even though in the @@ -101,8 +94,7 @@ class NnetChainTrainer { int32 num_minibatches_processed_; // stats for max-change. - std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 0acaa5c2008..b4563c7a2c3 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -30,6 +30,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, nnet_(nnet), compiler_(*nnet, config_.optimize_config, config_.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (config.zero_component_stats) ZeroComponentStats(nnet); @@ -38,9 +39,6 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (config_.read_cache != "") { bool binary; @@ -111,9 +109,9 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, delta_nnet_); // Update the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, - 1.0, 1.0 - config_.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, + 1.0, 1.0 - config_.momentum, nnet_, &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). 
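A minimal usage sketch of the MaxChangeStats-based API introduced in this
patch (variable names here are illustrative; see nnet-utils.h further down
for the actual declarations):

    MaxChangeStats stats(*nnet);  // sized from the nnet's updatable components
    bool ok = UpdateNnetWithMaxChange(*delta_nnet, max_param_change,
                                      1.0 /* max_change_scale */,
                                      1.0 - momentum /* scale */,
                                      nnet, &stats);
    // ... at the end of training:
    stats.Print(*nnet);  // logs how often the max-change was enforced.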
@@ -167,9 +165,10 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, + UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -236,40 +235,10 @@ bool NnetTrainer::PrintTotalStats() const { bool ok = info.PrintTotalStats(name); ans = ans || ok; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; -} - void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index fffc621930a..64ec7abc58e 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -26,6 +26,7 @@ #include "nnet3/nnet-compute.h" #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-example-utils.h" +#include "nnet3/nnet-utils.h" namespace kaldi { namespace nnet3 { @@ -187,10 +188,6 @@ class NnetTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -220,8 +217,7 @@ class NnetTrainer { int32 num_minibatches_processed_; // stats for max-change. 
- std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index e020f8fc6a7..8bc3f12027b 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2173,5 +2173,47 @@ void ApplyL2Regularization(const Nnet &nnet, } +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats) { + bool ans = UpdateNnetWithMaxChange( + delta_nnet, max_param_change, max_change_scale, + scale, nnet, + &(stats->num_max_change_per_component_applied), + &(stats->num_max_change_global_applied)); + stats->num_minibatches_processed++; + return ans; +} + + +void MaxChangeStats::Print(const Nnet &nnet) const { + int32 i = 0; + for (int32 c = 0; c < nnet.NumComponents(); c++) { + const Component *comp = nnet.GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + const UpdatableComponent *uc = dynamic_cast( + comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied[i] > 0) + KALDI_LOG << "For " << nnet.GetComponentName(c) + << ", per-component max-change was enforced " + << ((100.0 * num_max_change_per_component_applied[i]) / + num_minibatches_processed) + << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied > 0) + KALDI_LOG << "The global max-change was enforced " + << ((100.0 * num_max_change_global_applied) / + num_minibatches_processed) + << " \% of the time."; +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 787bd228a38..7e68291b9dc 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -377,6 +377,17 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, num_max_change_per_component_applied, int32 *num_max_change_global_applied); +class MaxChangeStats; + +// This overloaded version of UpdateNnetWithMaxChange() is a convenience +// wrapper for when you have a MaxChangeStats object to keep track +// of how many times the max-change was applied. See documentation above. +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats); + /** This function is used as part of the regular training workflow, prior to @@ -513,6 +524,24 @@ int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +struct MaxChangeStats { + int32 num_max_change_global_applied; + int32 num_minibatches_processed; + std::vector num_max_change_per_component_applied; + + MaxChangeStats(const Nnet &nnet): + num_max_change_global_applied(0), + num_minibatches_processed(0), + num_max_change_per_component_applied(NumUpdatableComponents(nnet), 0) { } + + // Prints the max-change stats. Usually will be called at the end + // of the program. The nnet is only needed for structural information, + // to work out the component names. 
+ void Print(const Nnet &nnet) const; +}; + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index a506dd75855..e2f58bf9c79 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -30,22 +30,18 @@ NnetChainaTopTrainer::NnetChainaTopTrainer( const NnetChainaTrainingOptions &config, const fst::StdVectorFst &den_fst, const differentiable_transform::DifferentiableTransform &transform, - CachingOptimizingCompiler *compiler, Nnet *nnet): lang_name_(lang_name), opts_(config), den_graph_(den_fst, nnet->OutputDim("output")), transform_(transform), - compiler_(compiler), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), nnet_(nnet), delta_nnet_(nnet->Copy()), num_minibatches_processed_(0), - num_max_change_global_applied_si_(0), - num_max_change_global_applied_(0) { - - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_per_component_applied_si_.resize(num_updatable, 0); + max_change_stats_si_(*nnet), + max_change_stats_(*nnet) { if (opts_.nnet_config.zero_component_stats) ZeroComponentStats(nnet); @@ -122,7 +118,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( request.outputs[1].has_deriv = true; request.outputs[1].name = (s.adapted ? "output-xent" : "output-xent-si"); request.outputs[1].indexes = request.outputs[0].indexes; - std::shared_ptr computation = compiler_->Compile( + std::shared_ptr computation = compiler_.Compile( request); computation_map_[s] = computation; return computation; @@ -223,9 +219,7 @@ bool NnetChainaTopTrainer::TrainUnadapted( nnet_config.max_param_change, opts_.unadapted_deriv_scale, 1.0 - nnet_config.momentum, // normally momentum is 0.0. - nnet_, - &num_max_change_per_component_applied_si_, - &num_max_change_global_applied_si_); + nnet_, &max_change_stats_si_); // Un-freeze the natural gradient. FreezeNaturalGradient(false, delta_nnet_); @@ -315,25 +309,16 @@ bool NnetChainaTopTrainer::TrainAdapted( input_deriv->AddMat(1.0, computer.GetOutput("input")); } - // Updates the parameters of nnet. Since the derivatives will all be scaled - // with "unadapted_deriv_scale" it makes sense to apply that same factor to - // the max-change, to keep the max-change in proportion with how much we - // expect the net to change (so smaller max-change values don't lead to more - // emphasize on the unadapted model's derivatives) + // Update the parameters of nnet. bool success = UpdateNnetWithMaxChange( *delta_nnet_, nnet_config.max_param_change, - opts_.unadapted_deriv_scale, + 1.0, 1.0 - nnet_config.momentum, // normally momentum is 0.0. - nnet_, - &num_max_change_per_component_applied_si_, - &num_max_change_global_applied_si_); + nnet_, &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what - // happens when we use the model with batchnorm test-mode set). - // Note: we don't do this for the unadapted pass, it would be redundant - // (although of course doing it only once changes the interpretation - // of the scale slightly). + // happens when, later on, we use the model with batchnorm test-mode set). 
ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); // The following will only do something if we have a LinearComponent @@ -417,6 +402,26 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, } +bool NnetChainaTopTrainer::PrintTotalStats() const { + bool ans = false; + if (output_si_objf_.PrintTotalStats(lang_name_ + ":output-si")) + ans = true; + if (output_objf_.PrintTotalStats(lang_name_ + ":output")) + ans = true; + if (output_si_xent_objf_.PrintTotalStats(lang_name_ + ":output-si-xent")) + ans = true; + if (output_xent_objf_.PrintTotalStats(lang_name_ + ":output-xent")) + ans = true; + KALDI_LOG << "Speaker-independent max-change stats for language " + << lang_name_ << ":"; + max_change_stats_si_.Print(*nnet_); + KALDI_LOG << "Speaker-dependent max-change stats for language " + << lang_name_ << ":"; + max_change_stats_.Print(*nnet_); + return ans; +} + + NnetComputer* NnetChainaBottomTrainer::Forward( int32 num_sequences, int32 first_input_t, @@ -431,14 +436,17 @@ NnetComputer* NnetChainaBottomTrainer::Forward( frames_per_sequence_in, frames_per_sequence_out, first_input_t, first_output_t); + // Note: this will be cached in the unordered_map owned by this class, so we + // don't have to worry about it being deleted before we're done with the + // NnetComputer object. std::shared_ptr computation = GetComputation(s); const NnetTrainerOptions &nnet_config = opts_.nnet_config; NnetComputer *computer = new NnetComputer(nnet_config.compute_config, - computation, nnet_, delta_nnet_); - computer.AcceptInput("input", input); - computer.Run(); - computer.GetOutputDestructive("output", output); + *computation, nnet_, delta_nnet_); + computer->AcceptInput("input", input); + computer->Run(); + computer->GetOutputDestructive("output", output); return computer; } @@ -448,22 +456,56 @@ void NnetChainaBottomTrainer::Backward(NnetComputer *computer, computer->AcceptInput("output", output_deriv); computer->Run(); - // TODO. + const NnetTrainerOptions &nnet_config = opts_.nnet_config; - // Updates the parameters of nnet. Since the derivatives will all be scaled - // with "unadapted_deriv_scale" it makes sense to apply that same factor to - // the max-change, to keep the max-change in proportion with how much we - // expect the net to change (so smaller max-change values don't lead to more - // emphasize on the unadapted model's derivatives) + // we may later provide a way to set a different max-change for the bottom + // nnet than on the top nnet. bool success = UpdateNnetWithMaxChange( *delta_nnet_, nnet_config.max_param_change, - opts_.unadapted_deriv_scale, + 1.0, 1.0 - nnet_config.momentum, // normally momentum is 0.0. nnet_, - &num_max_change_per_component_applied_si_, - &num_max_change_global_applied_si_); + &max_change_stats_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when, later on, we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. 
+ ConstrainOrthonormal(nnet_); + + if (!success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + num_minibatches_processed_++; +} + + +NnetChainaBottomTrainer::NnetChainaBottomTrainer( + const NnetChainaTrainingOptions &opts, + Nnet *nnet): + opts_(opts), + nnet_(nnet), + delta_nnet_(nnet->Copy()), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), + max_change_stats_(*nnet) { + if (opts_.nnet_config.zero_component_stats) + ZeroComponentStats(nnet); + ScaleNnet(0.0, delta_nnet_); + if (opts_.nnet_config.read_cache != "") { + // It would be complicated to implement, as there are various top nnets + // and they would all try to read and write the same cache files. + // To implement this, the best way would be to + KALDI_WARN << "The read-cache options are not currently supported."; + } + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0 && + opts_.nnet_config.max_param_change >= 0.0 && + opts_.bottom_subsampling_factor >= 1); } std::shared_ptr NnetChainaBottomTrainer::GetComputation( @@ -480,15 +522,16 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( first_output_t = s.first_output_t; ComputationRequest request; - request.need_model_derivative = train_bottom_model_; + request.need_model_derivative = opts_.train_bottom_nnet; request.store_component_stats = true; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); // The inputs are in the order: all frames of sequence 0; then all frames of - // sequence 1; and so on. This is done + // sequence 1; and so on. This is how the example-merging code does it, since + // it's more convenient when dealing with compressed matrices and the like. auto iter = request.inputs[0].indexes.begin(); - for (int32 n = n < num_sequences; n++) { + for (int32 n = 0; n < num_sequences; n++) { for (int32 t = first_input_t; t < first_input_t + frames_per_sequence_in; ++t,++iter) { iter->n = n; @@ -499,14 +542,14 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( // the second frame of all sequences; and so on. 
request.outputs.resize(1); request.outputs[0].name = "output"; - request.outputs[1].has_deriv = train_bottom_model_; + request.outputs[1].has_deriv = opts_.train_bottom_nnet; request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); - int32 t_stride_out = bottom_subsampling_factor_; + int32 t_stride_out = opts_.bottom_subsampling_factor; iter = request.outputs[0].indexes.begin(); for (int32 t = first_output_t; t < first_output_t + frames_per_sequence_out * t_stride_out; t += t_stride_out) { - for (int32 n = n < num_sequences; ++n,++iter) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { iter->n = n; iter->t = t; } @@ -518,58 +561,33 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( } -bool NnetChainTrainer::PrintTotalStats() const { - unordered_map::const_iterator - iter = objf_info_.begin(), - end = objf_info_.end(); +bool NnetChainaTrainer::PrintTotalStats() const { + bottom_trainer_.PrintTotalStats(); bool ans = false; - for (; iter != end; ++iter) { - const std::string &name = iter->first; - const ObjectiveFunctionInfo &info = iter->second; - ans = info.PrintTotalStats(name) || ans; - } - PrintMaxChangeStats(); + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) + if (iter->second->PrintTotalStats()) + ans = true; return ans; } -void NnetChainTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - const NnetTrainerOptions &nnet_config = opts_.nnet_config; - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; +NnetChainaTrainer::NnetChainaTrainer( + const NnetChainaTrainingOptions &config, + NnetChainaModels *models): + opts_(config), + models_(models), + bottom_trainer_(opts_, models->GetBottomNnet()) { } -NnetChainTrainer::~NnetChainTrainer() { - if (opts_.nnet_config.write_cache != "") { - Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); - compiler_.WriteCache(ko.Stream(), opts_.nnet_config.binary_write_cache); - KALDI_LOG << "Wrote computation cache to " << opts_.nnet_config.write_cache; - } - delete delta_nnet_; + +void NnetChainaTrainer::Train(const NnetChainExample &eg) { + // TODO. work out structure, etc. 
+} + +NnetChainaTrainer::~NnetChainaTrainer() { + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) + delete iter->second; } diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index aaad4858979..a8764e14133 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -250,8 +250,6 @@ class NnetChainaTopTrainer { @param [in] den_fst The denominator FST for this language @param [in] transform The transform object which will be used to produce adapted features after the first pass of training. - @param [in] compiler A pointer to the compiler we are to use (we make it - owned externally for easier caching). @param [in,out] nnet The neural net we are training. Expected to have outputs called "output-si" (speaker-independent output), "output", "output-si-xent", "output-xent", and an input called @@ -261,7 +259,6 @@ class NnetChainaTopTrainer { const NnetChainaTrainingOptions &config, const fst::StdVectorFst &den_fst, const differentiable_transform::DifferentiableTransform &transform, - CachingOptimizingCompiler *compiler, Nnet *nnet); /** Train on one minibatch. @@ -313,10 +310,6 @@ class NnetChainaTopTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetChainaTopTrainer(); private: @@ -492,19 +485,12 @@ class NnetChainaTopTrainer { const NnetChainaTrainingOptions &opts_; chain::DenominatorGraph den_graph_; const differentiable_transform::DifferentiableTransform &transform_; - // This is a pointer to a compiler owned outside this class (we had to - // implement it like this to enable computation caching to work with a single - // option). - CachingOptimizingCompiler *compiler_; + CachingOptimizingCompiler compiler_; Nnet *nnet_; - Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != - // 0.0. nnet representing accumulated parameter-change - // (we'd call this gradient_nnet_, but due to - // natural-gradient update, it's better to consider it as - // a delta-parameter nnet. - + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. // These objects keep track of the objective-function values for the 4 // outputs. We have the regular output (sequence objective) and the 'xent' @@ -521,11 +507,9 @@ class NnetChainaTopTrainer { int32 num_minibatches_processed_; // stats for max-change (for speaker-independent model). - std::vector num_max_change_per_component_applied_si_; - int32 num_max_change_global_applied_si_; - // stats for max-change (for speaker-dependent model). - std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_si_; + // stats for max-change (for speaker-adapted model). + MaxChangeStats max_change_stats_; }; @@ -538,23 +522,14 @@ class NnetChainaBottomTrainer { public: /** Constructor. - @param [in] nnet_config Options class - @param [in] train_bottom_model True if we are training the 'bottom' model - (otherwise this class just does the computation without - any backprop). - @param [in] bottom_subsampling_factor The factor by which we subsample - frames at the output of the 'bottom' nnet. E.g. if - this is 3, then the output frames in each sequence - would be numbered t=0, t=3, and so on. + @param [in] opts Options class. 
This class maintains a reference to it, + so don't delete it. @param [in,out] nnet The neural net we are training. Expected (for now) to have an input called 'input' (corresponding to the original input features and an output called 'output' (corresponding to the embeddings). */ - NnetChainaBottomTrainer(const NnetTrainerOptions &nnet_config, - int32 bottom_subsampling_factor, - bool train_bottom_model, - CachingOptimizingCompiler *compiler, + NnetChainaBottomTrainer(const NnetChainaTrainingOptions &opts, Nnet *nnet); /** Train on one minibatch. @@ -612,9 +587,6 @@ class NnetChainaBottomTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; ~NnetChainaBottomTrainer(); private: @@ -706,69 +678,19 @@ class NnetChainaBottomTrainer { int32 top_subsampling_factor, Posterior *post_at_input); - /** - Does the adapted pass of training. - @param [in] input The adapted input features. - @param [in] computation The adapted version of the - computation (this one uses the outputs - "output" and "output-xent" instead of - "output-si" and "output-si-xent". - @param [in] supervision The chain supervision - object, containing information derived - from the numerator lattices. - @param [in,out] input_deriv If non-NULL, the - feature derivative w.r.t. the [speaker-adapted] input - features will be *added* to this location. - */ - void TrainAdapted(const CuMatrixBase &input, - const NnetComputation &computation, - const chain::Supervision &supervision, - const VectorBase &deriv_weights, - CuMatrixBase *input_deriv); - - - void ProcessOutputs(const NnetChainExample &eg, - NnetComputer *computer); - - std::string lang_name_; - const NnetChainaTrainingOptions opts_; - bool train_top_model_; - chain::DenominatorGraph den_graph_; - const differentiable_transform::DifferentiableTransform &transform_; Nnet *nnet_; - Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != - // 0.0. nnet representing accumulated parameter-change - // (we'd call this gradient_nnet_, but due to - // natural-gradient update, it's better to consider it as - // a delta-parameter nnet. - - // This is a pointer to a compiler owned outside this class (we had to - // implement it like this to enable computation caching to work with a single - // option). - CachingOptimizingCompiler *compiler_; + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. - // These objects keep track of the objective-function values for the 4 - // outputs. We have the regular output (sequence objective) and the 'xent' - // output for cross-entropy regularization, and there are speaker independent - // (si) versions of those outputs also. - ObjectiveFunctionInfo output_si_objf_; - ObjectiveFunctionInfo output_si_xent_objf_; - ObjectiveFunctionInfo output_objf_; - ObjectiveFunctionInfo output_xent_objf_; + CachingOptimizingCompiler compiler_; - // Number of minibatches processed. Note: we actually train the nnet twice - // per minibatch, because there are the speaker-independent and - // speaker-dependent passes. + // Number of minibatches processed. int32 num_minibatches_processed_; - // stats for max-change (for speaker-independent model). 
- std::vector num_max_change_per_component_applied_si_; - int32 num_max_change_global_applied_si_; - // stats for max-change (for speaker-dependent model). - std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + // stats for max-change + MaxChangeStats max_change_stats_; }; @@ -801,15 +723,14 @@ class NnetChainaTrainer { ~NnetChainaTrainer(); private: + void FindEgStructure + const NnetChainaTrainingOptions &opts_; + // pointer to object owned outside this class. NnetChainaModels *models_; - // This 'compiler' object is shared by bottom_trainer and the objects - // stores in top_trainers_. Storing it here is helpful to simplify writing and - // reading of computation caches. - CachingOptimizingCompiler compiler_; - NnetChainaBottomTrainer *bottom_trainer_; + NnetChainaBottomTrainer bottom_trainer_; // map from language name (e.g. "default", "english", "french") to // the object that trains the corresponding 'top' nnet. std::unordered_map Date: Tue, 25 Dec 2018 11:44:39 -0800 Subject: [PATCH 40/87] [src] Make sure differentiable-transform TrainingBackward() adds to its input --- src/adapt/differentiable-transform-test.cc | 9 +++++++++ src/adapt/differentiable-transform.cc | 17 ++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc index 34efac9770c..ff76d7c738f 100644 --- a/src/adapt/differentiable-transform-test.cc +++ b/src/adapt/differentiable-transform-test.cc @@ -136,6 +136,13 @@ void TestTraining(DifferentiableTransform *transform) { output_feats(num_rows, dim, kUndefined), output_deriv(num_rows, dim, kUndefined), input_deriv(num_rows, dim); + + // This is to verify that TrainingBackward() adds to, rather than + // setting to, the input deriv. + CuMatrix random_input_deriv(num_rows, dim); + random_input_deriv.SetRandn(); + input_deriv.AddMat(1.0, random_input_deriv); + input_feats.SetRandn(); output_deriv.SetRandn(); Posterior post; @@ -154,6 +161,8 @@ void TestTraining(DifferentiableTransform *transform) { transform->TrainingBackward(input_feats, output_deriv, num_chunks, num_spk, post, info, &input_deriv); + // testing that TrainingBackward adds to the input deriv. + input_deriv.AddMat(-1.0, random_input_deriv); int32 n = 5; Vector expected_changes(n), observed_changes(n); diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc index 72e34e02764..e008f35adc7 100644 --- a/src/adapt/differentiable-transform.cc +++ b/src/adapt/differentiable-transform.cc @@ -234,7 +234,13 @@ void FmllrTransform::TrainingBackward( info->estimators[s]->GetVarDeriv()); info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu); - input_deriv->CopyFromMat(input_deriv_cpu); + // These TrainingBackward() functions are all supposed to add to the + // 'input_deriv'. + CuMatrix input_deriv_temp(input_deriv->NumRows(), + input_deriv->NumCols(), + kUndefined); + input_deriv_temp.CopyFromMat(input_deriv_cpu); + input_deriv->AddMat(1.0, input_deriv_temp); delete info; } @@ -499,8 +505,13 @@ void MeanOnlyTransform::TrainingBackward( } info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu); - input_deriv->CopyFromMat(input_deriv_cpu); - + // These TrainingBackward() functions are all supposed to add to the + // 'input_deriv'. 
+ CuMatrix input_deriv_temp(input_deriv->NumRows(), + input_deriv->NumCols(), + kUndefined); + input_deriv_temp.CopyFromMat(input_deriv_cpu); + input_deriv->AddMat(1.0, input_deriv_temp); delete info; } From 0254935e3d6507699e939e436a61a51b9b2f3d16 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 29 Dec 2018 10:02:02 -0500 Subject: [PATCH 41/87] Implement chain numerator posteriors --- src/chain/chain-training.cc | 30 +++++++++++++++++++++++++++--- src/chain/chain-training.h | 5 ++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 6b4a7b593c2..5cf9313fccb 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -28,6 +28,17 @@ namespace kaldi { namespace chain { +void ConvertDerivsToPosterior(const CuMatrixBase &numerator_derivs, + Posterior *numerator_post) { + numerator_post->resize(numerator_derivs.NumRows()); + for (size_t i = 0; i < numerator_derivs.NumRows(); ++i) { + const auto &row = numerator_derivs.Row(i); + for (size_t pdfid = 0; pdfid < row.Dim(); ++pdfid) + if (row(pdfid) != 0.0) + (*numerator_post)[i].push_back(std::make_pair(pdfid, row(pdfid))); + } +} + void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -37,7 +48,8 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { + CuMatrix *xent_output_deriv, + Posterior *numerator_post = NULL) { BaseFloat num_logprob_weighted, den_logprob_weighted; bool denominator_ok = true; bool numerator_ok = true; @@ -92,6 +104,11 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, numerator_ok = numerator_ok && (num_logprob_weighted - num_logprob_weighted == 0); + if (numerator_post && (xent_output_deriv || nnet_output_deriv)) { + ConvertDerivsToPosterior(nnet_output_deriv ? *nnet_output_deriv : + *xent_output_deriv, numerator_post); + } + *objf = num_logprob_weighted - den_logprob_weighted; if (!((*objf) - (*objf) == 0) || !denominator_ok || !numerator_ok) { // inf or NaN detected, or denominator computation returned false. @@ -146,11 +163,13 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { + CuMatrix *xent_output_deriv, + Posterior *numerator_post) { if (!supervision.e2e_fsts.empty()) { ComputeChainObjfAndDerivE2e(opts, den_graph, supervision, nnet_output, objf, l2_term, - weight, nnet_output_deriv, xent_output_deriv); + weight, nnet_output_deriv, + xent_output_deriv, numerator_post); return; } @@ -198,6 +217,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, } } + if (numerator_post && (xent_output_deriv || nnet_output_deriv)) { + ConvertDerivsToPosterior(nnet_output_deriv ? 
*nnet_output_deriv : + *xent_output_deriv, numerator_post); + } + *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 63e03c7e35f..440aeccda7c 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -28,6 +28,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-lib.h" +#include "hmm/posterior.h" #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" @@ -117,7 +118,9 @@ struct ChainTrainingOptions { used in computing the cross-entropy objective value. @param [out] numerator_post If non-NULL, then the posterior from the numerator forward-backward will be written here (note: it won't be - scaled by the supervision weight). This is intended for + scaled by the supervision weight). The order is the + same as the input (i.e., frame 0 for all sequences, + then frame 1, etc). This is intended for use in the adaptation framework used in "chaina" training. */ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, From 80e0f73279f174435a62d4b0d5b3daa2bbe924fb Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 29 Dec 2018 12:45:58 -0500 Subject: [PATCH 42/87] Support merging already-merged egs --- src/chain/chain-supervision.cc | 9 ++++----- src/nnet3/nnet-chain-example.cc | 13 +++++++++++-- src/nnet3/nnet-example-utils.cc | 18 ++++++++++++------ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index f8a2c1d11cc..be727d333d2 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -720,17 +720,16 @@ Supervision::Supervision(const Supervision &other): void MergeSupervisionE2e(const std::vector &input, Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); - KALDI_ASSERT(input[0]->e2e_fsts.size() == 1); *output_supervision = *(input[0]); output_supervision->e2e_fsts.reserve(input.size()); int32 frames_per_sequence = output_supervision->frames_per_sequence, num_seqs = input.size(); for (int32 i = 1; i < num_seqs; i++) { - output_supervision->num_sequences++; - KALDI_ASSERT(input[i]->e2e_fsts.size() == 1); + output_supervision->num_sequences += input[i]->num_sequences; KALDI_ASSERT(input[i]->frames_per_sequence == frames_per_sequence); - output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[0]); + for (int32 j = 0; j < input[i]->num_sequences; ++j) + output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[j]); } output_supervision->alignment_pdfs.clear(); // The program nnet3-chain-acc-lda-stats works on un-merged egs, @@ -766,7 +765,7 @@ void MergeSupervision(const std::vector &input, // append src.fst to output_supervision->fst. 
// the complexity here is O(V1 + E1) fst::Concat(src.fst, &output_supervision->fst); - output_supervision->num_sequences++; + output_supervision->num_sequences += src.num_sequences; } else { KALDI_ERR << "Mismatch weight or frames_per_sequence between inputs"; } diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index c31c7630eec..a0614da4916 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -221,6 +221,14 @@ static void MergeSupervision( &output_supervision); output->supervision.Swap(&output_supervision); + int32 example_stride = 0; + for (auto &index: inputs[0]->indexes) + if (index.n > example_stride) + example_stride = index.n; + example_stride++; + + KALDI_ASSERT(example_stride == inputs[0]->supervision.num_sequences); + output->indexes.clear(); output->indexes.reserve(num_indexes); for (int32 n = 0; n < num_inputs; n++) { @@ -233,8 +241,8 @@ static void MergeSupervision( // change the 'n' index to correspond to the index into 'input'. // Each example gets a different 'n' value, starting from 0. for (; iter != end; ++iter) { - KALDI_ASSERT(iter->n == 0 && "Merging already-merged chain egs"); - iter->n = n; + KALDI_ASSERT(iter->n < example_stride); + iter->n += n * example_stride; } } KALDI_ASSERT(output->indexes.size() == num_indexes); @@ -259,6 +267,7 @@ static void MergeSupervision( } } } + output->chunks_per_spk = example_stride; output->CheckDim(); } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index adbfae95794..555c83416c3 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -81,7 +81,13 @@ static void GetIoSizes(const std::vector &src, } - +static int32 FindMaxNValue(const NnetIo &io) { + int32 max_n = 0; + for (auto &index: io.indexes) + if (index.n > max_n) + max_n = index.n; + return max_n; +} // Do the final merging of NnetIo, once we have obtained the names, dims and // sizes for each feature/supervision type. @@ -98,6 +104,9 @@ static void MergeIo(const std::vector &src, // The features in the different NnetIo in the Indexes across all examples std::vector > output_lists(num_feats); + // This is 1 for single examples and larger than 1 for already-merged egs, and + // it must be the same for all io's across all examples: + int32 example_stride = FindMaxNValue(src[0].io[0]) + 1; // Initialize the merged_eg merged_eg->io.clear(); merged_eg->io.resize(num_feats); @@ -137,11 +146,8 @@ static void MergeIo(const std::vector &src, std::vector::iterator output_iter = output_io.indexes.begin(); // Set the n index to be different for each of the original examples. for (int32 i = this_offset; i < this_offset + this_size; i++) { - // we could easily support merging already-merged egs, but I don't see a - // need for it right now. - KALDI_ASSERT(output_iter[i].n == 0 && - "Merging already-merged egs? Not currently supported."); - output_iter[i].n = n; + KALDI_ASSERT(output_iter[i].n < example_stride); + output_iter[i].n += n * example_stride; } this_offset += this_size; // note: this_offset is a reference. 
} From 36323a8d61fefa5ae4ba2a7a923af5541df57a0b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 25 Dec 2018 15:26:02 -0800 Subject: [PATCH 43/87] [src] more bug-fixes, finish more code --- src/adapt/differentiable-transform-itf.h | 14 +- src/chain/chain-training.h | 1 + src/nnet3/nnet-utils.h | 2 +- src/nnet3a/Makefile | 8 +- src/nnet3a/nnet-chaina-training.cc | 681 ++++++++++++++++++----- src/nnet3a/nnet-chaina-training.h | 403 ++++++++------ src/nnet3a/nnet-chaina-utils.h | 143 +++++ src/nnet3a/notes.update | 30 +- 8 files changed, 971 insertions(+), 311 deletions(-) diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index 13983c8213f..dc68471ae0f 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -156,10 +156,11 @@ class DifferentiableTransform { will typically be output by a neural net, the 'bottom' net in our terminology. This will correspond to a whole minibatch, consisting of multiple speakers and multiple sequences (chunks) - per speaker. Caution: the order of both the input and - output features, and the posteriors, does not consist of blocks, - one per sequence, but rather blocks, one per time frame, so the - sequences are intercalated. This is the default order in + per speaker. Caution: in the input and + output features, and the posteriors, the 't' has the larger + stride than the minibatch-index 'n', so the order is: + first frame of all sequences; then the second frame of + all sequences; and so on. This is the default order in nnet3; see operator < of nnet3::Index. @param [in] num_chunks The number of individual sequences (e.g., chunks of speech) represented in 'input'. @@ -174,9 +175,10 @@ class DifferentiableTransform { form, the class supervision information that is used for the adaptation. posteriors.size() will be equal to input.NumRows(), and the ordering of its elements is the same as the ordering - of the rows of input, i.e. the sequences are intercalated. + of the rows of input (i.e. the 't' has the larger stride). There is no assumption that the posteriors sum to one; - this allows you to do things like silence weighting. + this allows you to do things like silence weighting. But + the posteriors are expected to be nonnegative. @param [out] output The adapted output. This matrix should have the same dimensions as 'input'. It does not have to be free of NaNs when you call this function. 
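Since this 't'-major ordering comes up repeatedly (the same convention is described for the posteriors above, and is used again later in this patch when padding posteriors with left/right context), a small sketch of the index arithmetic may help. The function name here is illustrative only, not part of the patch:

    // With num_sequences sequences (index 'n') and frames first_t, first_t+1, ...
    // (index 't'), the default nnet3 ordering keeps all sequences for a given
    // frame together, so:
    //   row = (t - first_t) * num_sequences + n
    // e.g. with num_sequences = 3 the rows are
    //   (t=first_t,n=0), (t=first_t,n=1), (t=first_t,n=2), (t=first_t+1,n=0), ...
    inline int32 RowOfIndex(int32 n, int32 t, int32 first_t, int32 num_sequences) {
      return (t - first_t) * num_sequences + n;
    }
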
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 63e03c7e35f..0a1191c0843 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -32,6 +32,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-den-graph.h" #include "chain/chain-supervision.h" diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 7e68291b9dc..60a18f15d84 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -377,7 +377,7 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, num_max_change_per_component_applied, int32 *num_max_change_global_applied); -class MaxChangeStats; +struct MaxChangeStats; // This overloaded version of UpdateNnetWithMaxChange() is a convenience // wrapper for when you have a MaxChangeStats object to keep track diff --git a/src/nnet3a/Makefile b/src/nnet3a/Makefile index dfa01545af5..5410c54f525 100644 --- a/src/nnet3a/Makefile +++ b/src/nnet3a/Makefile @@ -6,18 +6,18 @@ all: include ../kaldi.mk -TESTFILES = +TESTFILES = nnet-chaina-utils-test nnet-chaina-training-test -OBJFILES = nnet-chaina-training.o +OBJFILES = nnet-chaina-training.o nnet-chaina-utils.o LIBNAME = kaldi-nnet3a ADDLIBS = ../fstext/kaldi-fstext.a ../chain/kaldi-chain.a \ - ../nnet3/kaldi-nnet3.a ../adapt/kaldi-adapt.a \ + ../nnet3/kaldi-nnet3.a ../adapt/kaldi-adapt.a \ ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../lat/kaldi-lat.a \ - ../matrix-kaldi-matrix.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../util/kaldi-util.a \ ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index e2f58bf9c79..79940238e05 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -1,7 +1,6 @@ // nnet3/nnet-chaina-training.cc -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) -// 2016 Xiaohui Zhang +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -25,6 +24,142 @@ namespace kaldi { namespace nnet3 { +NnetChainaModels::NnetChainaModels( + bool zero_component_stats, + const std::string &model_dir, + const std::string &den_fst_dir, + const std::string &transform_dir): + zero_component_stats_(zero_component_stats), + model_dir_(model_dir), + den_fst_dir_(den_fst_dir), + transform_dir_(transform_dir) { + std::string bottom_nnet_name; // model_dir/bottom.raw + GetPathname(model_dir, "bottom", "raw", &bottom_nnet_name); + ReadKaldiObject(bottom_nnet_name, &bottom_nnet_); + if (zero_component_stats_) + ZeroComponentStats(&bottom_nnet_); + ComputeSimpleNnetContext(bottom_nnet_, + &bottom_nnet_left_context_, + &bottom_nnet_right_context_); +} + +void NnetChainaModels::GetPathname(const std::string &dir, + const std::string &name, + const std::string &suffix, + std::string *pathname) { + std::ostringstream str; + str << dir << '/' << name << '.' << suffix; + *pathname = str.str(); +} + +void NnetChainaModels::GetPathname(const std::string &dir, + const std::string &name, + int32 job_id, + const std::string &suffix, + std::string *pathname) { + std::ostringstream str; + str << dir << '/' << name << '.' << job_id << '.' 
<< suffix; + *pathname = str.str(); +} + +NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( + const std::string &lang) { + auto iter = lang_info_.find(lang); + if (iter != lang_info_.end()) { + return iter->second; + } else { + LanguageInfo *info = new LanguageInfo(); + + std::string model_filename, den_fst_filename, transform_filename; + GetPathname(model_dir_, lang, "mdl", &model_filename); + GetPathname(den_fst_dir_, lang, "fst", &den_fst_filename); + GetPathname(transform_dir_, lang, "ada", &transform_filename); + + { + bool binary; + Input ki(model_filename, &binary); + info->trans_model.Read(ki.Stream(), binary); + info->am_nnet.Read(ki.Stream(), binary); + if (zero_component_stats_) { + ZeroComponentStats(&(info->am_nnet.GetNnet())); + } + } + ReadFstKaldi(den_fst_filename, &(info->den_fst)); + { + bool binary; + Input ki(transform_filename, &binary); + info->transform = differentiable_transform::DifferentiableTransform::ReadNew( + ki.Stream(), binary); + } + lang_info_[lang] = info; + return info; + } +} + +Nnet* NnetChainaModels::GetBottomNnet() { + return &bottom_nnet_; +} + + +AmNnetSimple* NnetChainaModels::GetNnetForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->am_nnet); +} + +TransitionModel* NnetChainaModels::GetTransitionModelForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->trans_model); +} + +fst::StdVectorFst* NnetChainaModels::GetDenFstForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->den_fst); +} + +Nnet* NnetChainaModels::GetRawNnetForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->am_nnet.GetNnet()); +} + +differentiable_transform::DifferentiableTransform* +NnetChainaModels::GetTransformForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return info->transform; +} + + + +void NnetChainaModels::WriteRawModels(const std::string &model_out_dir, + bool binary, + int32 job_id) { + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + LanguageInfo *info = iter->second; + { + // we write it as a 'raw' model without the TransitionModel or + // the AmNnetSimple wrapper, since we can reconstruct those parts + // from the previous iter's model. + std::string top_model_name; + GetPathname(model_out_dir, lang_name, job_id, "raw", &top_model_name); + WriteKaldiObject(info->am_nnet.GetNnet(), top_model_name, binary); + } + } +} + + +NnetChainaModels::~NnetChainaModels() { + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) + delete iter->second; +} + NnetChainaTopTrainer::NnetChainaTopTrainer( const std::string &lang_name, const NnetChainaTrainingOptions &config, @@ -51,20 +186,33 @@ NnetChainaTopTrainer::NnetChainaTopTrainer( // It would be complicated to implement, as there are various top nnets // and they would all try to read and write the same cache files. 
// To implement this, the best way would be to - KALDI_WARN << "The read-cache options are not currently supported."; + KALDI_WARN << "The read-cache options are not currently supported here."; } - KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0 && - opts_.nnet_config.max_param_change >= 0.0); + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0); } -/** - TODO: include this somewhere. - if (num_minibatches_processed_ == 0) { - ConsolidateMemory(nnet_); - ConsolidateMemory(delta_nnet_); - } -*/ +NnetChainaTopTrainer::ComputationStructure::ComputationStructure( + bool adapted, + bool train_model, + bool need_input_deriv, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 top_subsampling_factor): + adapted(adapted), train_model(train_model), + need_input_deriv(need_input_deriv), num_sequences(num_sequences), + frames_per_sequence_in(frames_per_sequence_in), + frames_per_sequence_out(frames_per_sequence_out), + first_input_t(first_input_t), + top_subsampling_factor(top_subsampling_factor) { } + + +void NnetChainaTopTrainer::ConsolidateMemory() { + ::kaldi::nnet3::ConsolidateMemory(nnet_); + ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} std::shared_ptr NnetChainaTopTrainer::GetComputation( @@ -81,25 +229,36 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( first_output_t = 0, top_subsampling_factor = s.top_subsampling_factor; - ComputationRequest request; - request.need_model_derivative = opts_.train_top_nnet; + if (nnet_->InputDim("input") < 0 || + nnet_->OutputDim("output") < 0 || + nnet_->OutputDim("output-si") < 0 || + nnet_->OutputDim("output-xent") < 0 || + nnet_->OutputDim("output-si-xent") < 0) { + KALDI_ERR << "Top neural net for chaina training must have an input called " + "'input' and outputs called 'output', 'output-xent', 'output-si', and " + "'output-si-xent'."; + } + ComputationRequest request; + request.need_model_derivative = s.train_model; request.store_component_stats = true; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); request.inputs[0].has_deriv = s.need_input_deriv; - // The inputs are in the order: all frames of sequence 0; then all frames of - // sequence 1; and so on. This is done + // The inputs are in the order: the first frame of all sequences; the second + // frame of all sequences; and so on. auto iter = request.inputs[0].indexes.begin(); - for (int32 n = 0; n < num_sequences; n++) { - for (int32 t = first_input_t; - t < first_input_t + frames_per_sequence_in; ++t,++iter) { + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { iter->n = n; iter->t = t; + // the x values will already be 0, thanks to the default constructor of + // Index(). } } - // ... but the outputs are in the order: the first frame of all sequences; + // The outputs are also in the order: the first frame of all sequences; // the second frame of all sequences; and so on. request.outputs.resize(2); request.outputs[0].name = (s.adapted ? 
"output" : "output-si"); @@ -128,9 +287,10 @@ bool NnetChainaTopTrainer::TrainUnadapted( const CuMatrixBase &input, const NnetComputation &computation, const chain::Supervision &supervision, + BaseFloat model_training_scale, const CuVectorBase &deriv_weights, Posterior *posterior, - CuMatrixBase *input_deriv) { + CuMatrix *input_deriv) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -140,15 +300,10 @@ bool NnetChainaTopTrainer::TrainUnadapted( NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); - // Freeze the natural gradient. We dont want to update the NG scatter - // matrices on this data because we'll next be running the same nnet on the - // speaker-adapted version of the same data, and it would violate the - // independence assumptions needed for NG to work if we updated them. - FreezeNaturalGradient(true, delta_nnet_); - - // give the inputs to the computer object. + // Give the inputs to the computer object. CuMatrix input_copy(input); computer.AcceptInput("input", &input_copy); + // Do the forward propagation. computer.Run(); const CuMatrixBase @@ -159,8 +314,9 @@ bool NnetChainaTopTrainer::TrainUnadapted( kUndefined), output_xent_deriv; - // Note: we don't normally use the l2 term any more, parameter-level - // regularization seems to work better. + // Note: we normally turn the chain l2 regularization (which is l2 on the + // output of the nnet) off now, since parameter-level l2 regularization seems + // to work better. So expect 'tot_l2_term' to be zero. BaseFloat tot_objf, tot_l2_term, tot_weight; ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, @@ -169,15 +325,23 @@ bool NnetChainaTopTrainer::TrainUnadapted( &output_deriv, &output_xent_deriv, posterior); + if (!(tot_objf - tot_objf == 0.0)) { + // A NaN or inf was encountered in the objective computation. + // The input_deriv won't be used, so no need to set it. + // Un-freeze the natural gradient and return. + return false; + } + { // this block computes and keeps track of the cross-entropy objective. // at this point, xent_deriv is posteriors derived from the numerator - // computation. note, xent_objf has a factor of '.supervision.weight' + // computation. note, xent_objf has a factor of '.supervision.weight', + // which is also included in 'tot_weight'. BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); output_si_xent_objf_.UpdateStats(lang_name_ + ":output-si-xent", - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, xent_objf); + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); } if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { @@ -185,58 +349,72 @@ bool NnetChainaTopTrainer::TrainUnadapted( output_xent_deriv.MulRowsVec(deriv_weights); } - if (opts_.unadapted_deriv_scale != 1.0) - output_deriv.Scale(opts_.unadapted_deriv_scale); - - computer.AcceptInput("output-si", &output_deriv); - - output_xent_deriv.Scale(opts_.chain_config.xent_regularize * - opts_.unadapted_deriv_scale); - computer.AcceptInput("output-si-xent", &output_xent_deriv); - output_si_objf_.UpdateStats(lang_name_ + ":output-si", opts_.nnet_config.print_interval, num_minibatches_processed_, tot_weight, tot_objf, tot_l2_term); - // Do the backprop. We know we're either updating the nnet or need the - // input derivatives (else, what point is there in training), so there - // must be a backprop pass. 
- computer.Run(); + if (input_deriv == NULL && model_training_scale == 0.0) + return true; - if (input_deriv != NULL) { - input_deriv->AddMat(opts_.unadapted_backprop_scale, - computer.GetOutput("input")); - } + // Freeze the natural gradient. We don't want to update the NG scatter + // matrices on this data because we'll next be running the same nnet on the + // speaker-adapted version of the same data, and it would violate the + // independence assumptions needed for NG to work if we updated them. + if (model_training_scale != 0.0) + FreezeNaturalGradient(true, delta_nnet_); - // Updates the parameters of nnet. Since the derivatives will all be scaled - // with "unadapted_deriv_scale" it makes sense to apply that same factor to - // the max-change, to keep the max-change in proportion with how much we - // expect the net to change (so smaller max-change values don't lead to more - // emphasize on the unadapted model's derivatives) - bool success = UpdateNnetWithMaxChange( - *delta_nnet_, - nnet_config.max_param_change, - opts_.unadapted_deriv_scale, - 1.0 - nnet_config.momentum, // normally momentum is 0.0. - nnet_, &max_change_stats_si_); + computer.AcceptInput("output-si", &output_deriv); + + output_xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer.AcceptInput("output-si-xent", &output_xent_deriv); - // Un-freeze the natural gradient. - FreezeNaturalGradient(false, delta_nnet_); + // Do the backprop. + computer.Run(); - if (!success) - ScaleNnet(nnet_config.momentum, delta_nnet_); - else - ScaleNnet(0.0, delta_nnet_); - return success; + if (input_deriv != NULL) + computer.GetOutputDestructive("input", input_deriv); + + static bool warned_momentum = false; + if (model_training_scale != 1.0 && + nnet_config.momentum != 0.0 && !warned_momentum) { + KALDI_WARN << "Momentum does not interact correctly with top_weight or " + "bottom_weight values. Will not warn again."; + warned_momentum = true; + } + + if (model_training_scale != 0.0) { + // If we're actually training the top model... + + // Update the parameters of nnet. + // Note: normally momentum is 0.0. + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + 1.0, + model_training_scale * (1.0 - nnet_config.momentum), + nnet_, &max_change_stats_si_); + + // Un-freeze the natural gradient. + FreezeNaturalGradient(false, delta_nnet_); + + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + return success; + } else { + return true; + } } bool NnetChainaTopTrainer::TrainAdapted( - const CuMatrixBase &input, const NnetComputation &computation, const chain::Supervision &supervision, + BaseFloat model_training_scale, const CuVectorBase &deriv_weights, - CuMatrixBase *input_deriv) { + CuMatrix *input, + CuMatrix *input_deriv) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -246,9 +424,9 @@ bool NnetChainaTopTrainer::TrainAdapted( NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); - // give the inputs to the computer object. - CuMatrix input_copy(input); - computer.AcceptInput("input", &input_copy); + // give the input to the computer object. + computer.AcceptInput("input", input); + // Do the forward computation computer.Run(); const CuMatrixBase @@ -259,8 +437,9 @@ bool NnetChainaTopTrainer::TrainAdapted( kUndefined), output_xent_deriv; - // Note: we don't normally use the l2 term any more, parameter-level - // regularization seems to work better. 
+ // Note: we don't normally use the l2 term any more; parameter-level + // regularization seems to work better than regularization of the + // nnet output. BaseFloat tot_objf, tot_l2_term, tot_weight; ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, @@ -268,6 +447,12 @@ bool NnetChainaTopTrainer::TrainAdapted( &tot_objf, &tot_l2_term, &tot_weight, &output_deriv, &output_xent_deriv); + if (!(tot_objf - tot_objf == 0.0)) { + // A NaN or inf was encountered in the objective computation. the input_deriv + // won't be used by the calling code, so no need to set it. + return false; + } + { // this block computes and keeps track of the cross-entropy objective. // at this point, xent_deriv is posteriors derived from the numerator @@ -278,6 +463,13 @@ bool NnetChainaTopTrainer::TrainAdapted( num_minibatches_processed_, tot_weight, xent_objf); } + output_objf_.UpdateStats(lang_name_ + ":output", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (input_deriv == NULL && model_training_scale == 0.0) + return true; if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { output_deriv.MulRowsVec(deriv_weights); @@ -288,48 +480,40 @@ bool NnetChainaTopTrainer::TrainAdapted( output_xent_deriv.Scale(opts_.chain_config.xent_regularize); computer.AcceptInput("output-xent", &output_xent_deriv); - output_objf_.UpdateStats(lang_name_ + ":output", - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, tot_objf, tot_l2_term); - - if (input_deriv == NULL && !opts_.train_top_nnet) { - // We're neither training the top model nor need the input derivatives. - // E.g., we might be just getting stats for batch normalization after - // training the model. - return true; - } - - // Do the backprop. We know we're either updating the nnet or need the - // input derivatives (else, what point is there in training), so there - // must be a backprop pass. + // Do the backprop. computer.Run(); - if (input_deriv != NULL) { - input_deriv->AddMat(1.0, computer.GetOutput("input")); + if (input_deriv != NULL) + computer.GetOutputDestructive("input", input_deriv); + + if (model_training_scale != 0.0) { + // If we're actually training the top model... + + // Update the parameters of nnet. + // Note: normally, momentum is 0.0. + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + 1.0, + model_training_scale * (1.0 - nnet_config.momentum), + nnet_, &max_change_stats_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when, later on, we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. + ConstrainOrthonormal(nnet_); + + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + return success; + } else { + return true; } - - // Update the parameters of nnet. - bool success = UpdateNnetWithMaxChange( - *delta_nnet_, - nnet_config.max_param_change, - 1.0, - 1.0 - nnet_config.momentum, // normally momentum is 0.0. - nnet_, &max_change_stats_); - - // Scale down the batchnorm stats (keeps them fresh... this affects what - // happens when, later on, we use the model with batchnorm test-mode set). 
- ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); - - // The following will only do something if we have a LinearComponent - // or AffineComponent with orthonormal-constraint set to a nonzero value. - ConstrainOrthonormal(nnet_); - - if (!success) - ScaleNnet(nnet_config.momentum, delta_nnet_); - else - ScaleNnet(0.0, delta_nnet_); - return success; } @@ -340,31 +524,44 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, int32 top_subsampling_factor, const VectorBase &deriv_weights_in, const chain::Supervision &supervision, - CuMatrixBase *input_deriv) { + BaseFloat model_training_scale, + CuMatrix *input_deriv) { KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences != 0); int32 frames_per_sequence_in = input.NumRows() / num_sequences, frames_per_sequence_out = supervision.frames_per_sequence; bool adapted = false; ComputationStructure structure( - adapted, (input_deriv != NULL), + adapted, (model_training_scale != 0.0), (input_deriv != NULL), num_sequences, frames_per_sequence_in, frames_per_sequence_out, first_input_t, top_subsampling_factor); + // Will be the numerator posterior from the unadapted pass, which will be + // padded with l/r context and used to estimate the adapted features. Posterior post; - CuVector deriv_weights(deriv_weights_in); + CuVector deriv_weights; + if (opts_.apply_deriv_weights) + deriv_weights = deriv_weights_in; std::shared_ptr computation_unadapted = GetComputation(structure); - if (!TrainUnadapted(input, *computation_unadapted, supervision, - deriv_weights, &post, input_deriv)) { + bool success = TrainUnadapted( + input, *computation_unadapted, supervision, + model_training_scale * opts_.unadapted_top_weight, + deriv_weights, &post, input_deriv); + + if (!success) { num_minibatches_processed_++; - if (input_deriv) - input_deriv->SetZero(); return false; } + if (input_deriv) { + // Apply the scale from --unadapted-bottom-weight. We'll supply the other + // factor that comes from from the language-specific bottom_weight ("bw") + // ito UpdateNnetWithMaxChange() later on when we train the bottom nnet. 
+ input_deriv->Scale(opts_.unadapted_bottom_weight); + } Posterior post_padded(input.NumRows()); ConvertPosterior(post, num_sequences, first_input_t, @@ -376,32 +573,69 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, CuMatrix adapted_input(input.NumRows(), input.NumCols(), kUndefined), - adapted_input_deriv(input.NumRows(), input.NumCols()); + adapted_input_deriv; using namespace differentiable_transform; MinibatchInfoItf *minibatch_info = transform_.TrainingForward( input, num_sequences, num_spk, post_padded, &adapted_input); - if (!TrainAdapted(adapted_input, *computation_adapted, supervision, - deriv_weights, &adapted_input_deriv)) { - num_minibatches_processed_++; - if (input_deriv) - input_deriv->SetZero(); + success = TrainAdapted( + *computation_adapted, supervision, + model_training_scale, deriv_weights, + &adapted_input, &adapted_input_deriv); + + num_minibatches_processed_++; + if (!success) return false; - } - if (input_deriv == NULL) { + if (input_deriv == NULL) delete minibatch_info; - } else { + else transform_.TrainingBackward(input, adapted_input_deriv, num_sequences, num_spk, post_padded, minibatch_info, input_deriv); - } - num_minibatches_processed_++; return true; } +void NnetChainaTopTrainer::ConvertPosterior( + const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + Posterior *post_at_input) { + int32 output_post_size = post_at_output.size(), + input_post_size = post_at_input->size(), + s = top_subsampling_factor; + KALDI_ASSERT(input_post_size % num_sequences == 0 && + output_post_size % num_sequences == 0 && + input_post_size >= output_post_size * top_subsampling_factor && + top_subsampling_factor > 0); + int32 num_frames_out = output_post_size / num_sequences, + num_frames_in = input_post_size / num_sequences, + last_input_t = first_input_t + (num_frames_in - 1), + first_output_t = 0, + last_output_t = first_output_t + s * (num_frames_out - 1); + + int32 half_s = s / 2; // note: this will round down, which is intended. + + for (int32 t_in = first_input_t; t_in <= last_input_t; t_in++) { + // find the corresponding output frame by rounding t to the closest + // t that's a multiple of top_subsampling_factor (rounding down in + // case of ties). We do this by adding half_s and rounding down. + int32 t_out = s * DivideRoundingDown(t_in + half_s, s); + if (t_out >= first_output_t && t_out <= last_output_t) { + for (int32 n = 0; n < num_sequences; n++) { + int32 input_index = num_sequences * (t_in - first_input_t) + n, + output_index = num_sequences * ((t_out - first_output_t) / s) + n; + (*post_at_input)[input_index] = post_at_output[output_index]; + } + } + // else just leave the posterior for this frame empty. This will happen for + // most of the frames that were added for left and right context. 
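+    // For example: with s = 3 (so half_s = 1), input frames t = -1, 0, 1 all
+    // map to output frame t = 0; frames t = 2, 3, 4 map to output frame t = 3;
+    // and input frames before t = -1, or more than one frame past
+    // last_output_t, keep an empty posterior.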
+ } +} + bool NnetChainaTopTrainer::PrintTotalStats() const { bool ans = false; if (output_si_objf_.PrintTotalStats(lang_name_ + ":output-si")) @@ -422,16 +656,26 @@ bool NnetChainaTopTrainer::PrintTotalStats() const { } +NnetChainaTopTrainer::~NnetChainaTopTrainer() { + delete delta_nnet_; +} + +void NnetChainaBottomTrainer::ConsolidateMemory() { + ::kaldi::nnet3::ConsolidateMemory(nnet_); + ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + NnetComputer* NnetChainaBottomTrainer::Forward( int32 num_sequences, int32 first_input_t, int32 first_output_t, int32 frames_per_sequence_out, + bool train_model, CuMatrix *input, CuMatrix *output) { KALDI_ASSERT(input->NumRows() != 0 && input->NumRows() % num_sequences == 0); int32 frames_per_sequence_in = input->NumRows() / num_sequences; - ComputationStructure s(opts_.train_bottom_nnet, + ComputationStructure s(train_model, num_sequences, frames_per_sequence_in, frames_per_sequence_out, @@ -447,24 +691,35 @@ NnetComputer* NnetChainaBottomTrainer::Forward( computer->AcceptInput("input", input); computer->Run(); computer->GetOutputDestructive("output", output); - return computer; + if (!train_model) { + delete computer; + return NULL; + } else { + return computer; + } } -void NnetChainaBottomTrainer::Backward(NnetComputer *computer, +void NnetChainaBottomTrainer::Backward(BaseFloat model_training_scale, + NnetComputer *computer, CuMatrix *output_deriv) { + // if model_training_scale was 0.0, this function should not have been called. + KALDI_ASSERT(model_training_scale > 0.0); computer->AcceptInput("output", output_deriv); computer->Run(); + delete computer; + const NnetTrainerOptions &nnet_config = opts_.nnet_config; // we may later provide a way to set a different max-change for the bottom // nnet than on the top nnet. + // Note: normally, momentum is 0.0. bool success = UpdateNnetWithMaxChange( *delta_nnet_, nnet_config.max_param_change, 1.0, - 1.0 - nnet_config.momentum, // normally momentum is 0.0. + model_training_scale * (1.0 - nnet_config.momentum), nnet_, &max_change_stats_); @@ -476,11 +731,18 @@ void NnetChainaBottomTrainer::Backward(NnetComputer *computer, // or AffineComponent with orthonormal-constraint set to a nonzero value. ConstrainOrthonormal(nnet_); - if (!success) + if (success) ScaleNnet(nnet_config.momentum, delta_nnet_); else ScaleNnet(0.0, delta_nnet_); + static bool warned_momentum = false; + if (model_training_scale != 1.0 && nnet_config.momentum != 0.0 && + !warned_momentum) { + KALDI_WARN << "Momentum does not interact correctly with top_weight or " + "bottom_weight values. Will not warn again."; + warned_momentum = true; + } num_minibatches_processed_++; } @@ -521,15 +783,21 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( first_input_t = s.first_input_t, first_output_t = s.first_output_t; + if (nnet_->InputDim("input") < 0 || + nnet_->OutputDim("output") < 0) { + KALDI_ERR << "Bottom neural net for chaina training must have an input " + "called 'input' and an output called 'output'."; + } + ComputationRequest request; - request.need_model_derivative = opts_.train_bottom_nnet; + request.need_model_derivative = s.train_model; request.store_component_stats = true; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); // The inputs are in the order: all frames of sequence 0; then all frames of // sequence 1; and so on. 
This is how the example-merging code does it, since - // it's more convenient when dealing with compressed matrices and the like. + // it's more convenient when dealing with compressed matrices. auto iter = request.inputs[0].indexes.begin(); for (int32 n = 0; n < num_sequences; n++) { for (int32 t = first_input_t; @@ -542,7 +810,7 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( // the second frame of all sequences; and so on. request.outputs.resize(1); request.outputs[0].name = "output"; - request.outputs[1].has_deriv = opts_.train_bottom_nnet; + request.outputs[0].has_deriv = s.train_model; request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); int32 t_stride_out = opts_.bottom_subsampling_factor; iter = request.outputs[0].indexes.begin(); @@ -561,6 +829,20 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( } +NnetChainaBottomTrainer::~NnetChainaBottomTrainer() { + delete delta_nnet_; +} + + +void NnetChainaTrainer::GetContextInfo( + const std::string &lang, + int32 *bottom_left_context, + int32 *bottom_right_context, + int32 *top_left_context, + int32 *top_right_context) { + +} + bool NnetChainaTrainer::PrintTotalStats() const { bottom_trainer_.PrintTotalStats(); bool ans = false; @@ -577,13 +859,125 @@ NnetChainaTrainer::NnetChainaTrainer( opts_(config), models_(models), bottom_trainer_(opts_, models->GetBottomNnet()) { + ComputeSimpleNnetContext(*models->GetBottomNnet(), + &bottom_left_context_, + &bottom_right_context_); } -void NnetChainaTrainer::Train(const NnetChainExample &eg) { - // TODO. work out structure, etc. +NnetChainaTopTrainer* NnetChainaTrainer::GetTopTrainerForLang( + const std::string &lang) { + auto iter = top_trainers_.find(lang); + if (iter != top_trainers_.end()) + return iter->second; + NnetChainaTopTrainer *ans = + new NnetChainaTopTrainer( + lang, opts_, + *(models_->GetDenFstForLang(lang)), + *(models_->GetTransformForLang(lang)), + models_->GetRawNnetForLang(lang)); + top_trainers_[lang] = ans; + return ans; } +// 'key' might be something like "afsdadsfds12345?lang=english&tw=1.0&bw=0.5" +// expressing how much we want this eg to be used to train the top, and bottom, +// models respectively. +void NnetChainaTrainer::Train(const std::string &key, + const NnetChainExample &eg) { + size_t num_top_trainers = top_trainers_.size(); + std::string lang_name = "default"; + // 'top_weight' is a weight on the derivatives and max-change + // when training the top model, 'bottom_weight' is the same + // for the bottom model. 
+ BaseFloat top_weight = 1.0, + bottom_weight = 1.0; + ParseFromQueryString(key, "lang", &lang_name); + ParseFromQueryString(key, "tw", &top_weight); + ParseFromQueryString(key, "bw", &bottom_weight); + if (!(top_weight >= 0.0 && bottom_weight >= 0.0 && + (top_weight > 0.0 || bottom_weight > 0.0))) + KALDI_ERR << "Either the top or bottom weight " + "must be nonzero; neither can be negative: key=" << key; + + int32 num_sequences, chunks_per_spk, first_input_t, + num_input_frames, num_output_frames, + frame_subsampling_factor, + eg_left_context, eg_right_context; + FindChainaExampleStructure(eg, &num_sequences, &chunks_per_spk, + &first_input_t, + &num_input_frames, &num_output_frames, + &frame_subsampling_factor, + &eg_left_context, &eg_right_context); + KALDI_ASSERT(chunks_per_spk % num_sequences == 0); + int32 num_spk = num_sequences / chunks_per_spk; + + AmNnetSimple *top_am_nnet = models_->GetNnetForLang(lang_name); + int32 top_left_context = top_am_nnet->LeftContext(), + top_right_context = top_am_nnet->RightContext(); + + int32 first_embedding_t, + num_embedding_frames; + ComputeEmbeddingTimes(first_input_t, num_input_frames, num_output_frames, + frame_subsampling_factor, + opts_.bottom_subsampling_factor, + bottom_left_context_, bottom_right_context_, + top_left_context, top_right_context, + opts_.keep_embedding_context, + &first_embedding_t, &num_embedding_frames); + + const GeneralMatrix &eg_input = eg.inputs[0].features; + CuMatrix cu_input(eg_input.NumRows(), eg_input.NumCols(), + kUndefined), + cu_embedding; + eg_input.CopyToMat(&cu_input); + bool train_bottom_nnet = (bottom_weight != 1.0); + KALDI_ASSERT(cu_input.NumRows() == num_input_frames * num_sequences); + + NnetComputer *computer = bottom_trainer_.Forward( + num_sequences, first_input_t, + first_embedding_t, num_embedding_frames, + train_bottom_nnet, + &cu_input, &cu_embedding); + + int32 b = opts_.bottom_subsampling_factor, + first_embedding_t_subsampled = first_embedding_t / b, + top_subsampling_factor = frame_subsampling_factor / b; + + NnetChainaTopTrainer *top_trainer = GetTopTrainerForLang(lang_name); + + CuMatrix cu_embedding_deriv; + if (train_bottom_nnet) + cu_embedding_deriv.Resize(cu_embedding.NumRows(), cu_embedding.NumCols()); + + + bool success = top_trainer->Train(cu_embedding, num_sequences, + num_spk, + first_embedding_t_subsampled, + top_subsampling_factor, + eg.outputs[0].deriv_weights, + eg.outputs[0].supervision, + top_weight, + (train_bottom_nnet ? + &cu_embedding_deriv : NULL)); + + if (success && train_bottom_nnet) { + bottom_trainer_.Backward(bottom_weight, computer, + &cu_embedding_deriv); + } else { + delete computer; // if it's NULL, this will do nothing. + } + + if (top_trainers_.size() != num_top_trainers) { + // Move any permanently held bits of GPU memory to low addresses, to reduce + // fragmentation. 
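+    // (This is only the case on the first minibatch we process for a given
+    // language, i.e. right after GetTopTrainerForLang() has created a new
+    // top trainer.)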
+ bottom_trainer_.ConsolidateMemory(); + top_trainer->ConsolidateMemory(); + } + +} + + NnetChainaTrainer::~NnetChainaTrainer() { for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); ++iter) @@ -591,5 +985,6 @@ NnetChainaTrainer::~NnetChainaTrainer() { } + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index a8764e14133..b077d3b3ecd 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -38,29 +38,36 @@ struct NnetChainaTrainingOptions { NnetTrainerOptions nnet_config; chain::ChainTrainingOptions chain_config; bool apply_deriv_weights; - BaseFloat unadapted_deriv_scale; - BaseFloat unadapted_backprop_scale; - bool train_bottom_nnet; // True if we will be training the bottom nnet. - bool train_top_nnet; // True if we will be training the top nnet. + BaseFloat unadapted_top_weight; + BaseFloat unadapted_bottom_weight; int32 bottom_subsampling_factor; bool keep_embedding_context; NnetChainaTrainingOptions(): apply_deriv_weights(true), - unadapted_deriv_scale(0.5), - unadapted_backprop_scale(1.0), - train_bottom_nnet(true), - train_top_nnet(true), + unadapted_top_weight(1.0), + unadapted_bottom_weight(0.5), bottom_subsampling_factor(1), keep_embedding_context(true) { } void Register(OptionsItf *opts) { nnet_config.Register(opts); chain_config.Register(opts); - opts->Register("train-bottom-nnet", &train_bottom_nnet, - "Set this to false to disable training of bottom nnet."); - opts->Register("train-top-nnet", &train_top_nnet, - "Set this to false to disable training of top nnet."); + opts->Register("apply-deriv-weights", &apply_deriv_weights, + "If true, apply the per-frame derivative weights stored with " + "the example"); + opts->Register("unadapted-top-weight", &unadapted_top_weight, + "Scale used for the step sizes and max-change values when " + "training the top nnet and evaluating the unadapted output. " + "Affects how strongly the top nnets are trained by the " + "unadapted embeddings. The scale on the adapted branch is " + "implicitly 1.0, but all these numbers also get multiplied " + "by language-specific weights obtained from the egs."); + opts->Register("unadapted-bottom-weight", &unadapted_bottom_weight, + "Scale that is applied to the derivatives arising from the " + "unadapted branch of the top nnets, when training the bottom " + "nnet. Affects how much we prioritize the unadapted " + "features for bottom nnet training."); opts->Register("bottom-subsampling-factor", &bottom_subsampling_factor, "Determines the frequency at which we subsample the " "embeddings from the bottom nnet. Implicitly, the " @@ -76,44 +83,26 @@ struct NnetChainaTrainingOptions { "optional dependencies (for example: if it uses " "StatisticsExtractionComponent, IfDefined(), Failover(), " "etc.)."); - opts->Register("apply-deriv-weights", &apply_deriv_weights, - "If true, apply the per-frame derivative weights stored with " - "the example"); - opts->Register("unadapted-deriv-scale", &unadapted_deriv_scale, - "Scale on the derivatives (and max-change values, for the top " - "nnet) for the unadapted branches of the nnets (at the outputs " - "output-si and output-si-xent. Affects how strongly the nnets " - "are trained by the unadapted embeddings. Note: this also " - "affects the derivatives given to the bottom nnet. 
The scale " - "on the adapted branch is implicitly 1.0."); - opts->Register("unadapted-backprop-scale", &unadapted_backprop_scale, - "Scale that is applied to the derivatives arising from the " - "unadapted branch of the top nnets, when backpropagating " - "to the embeddings. Affects how much we prioritize the " - "unadapted features. Note: this is effectively multiplied by " - "unadapted-deriv-scale; unadapted-deriv-scale also affects " - "training of the top nnet."); } void Check() { - KALDI_ASSERT(unadapted_deriv_scale > 0.0 && - unadapted_backprop_scale >= 0.0); - // TODO: add more checks? + KALDI_ASSERT(unadapted_top_weight > 0.0 && + unadapted_bottom_weight >= 0.0 && + bottom_subsampling_factor > 0); } - }; /** - This struct, intended mostly to be accessed by NnetChainaTrainer, handles the + This class, intended to mostly be accessed by NnetChainaTrainer, handles the logic of reading the models and their corresponding denominator FSTs from disk, and of writing out the corresponding (raw) trained models when this iteration of training has finished. - The reason this is not entirely trivial is that we want to make it easy - to support the multilingual case. In this case there is one 'bottom' - model (the embedding extractor) but there may be multiple 'top' models, - each with their associated transition model and denominator FST, and their - own name. We use a directory to organize these. + The reason this is not entirely trivial is that we want to make it easy to + support the multilingual case. In this case there is one 'bottom' model (the + embedding extractor) but there may be multiple 'top' models, each with their + associated transition model and denominator FST, containing their own + langauge name. We use a directory to organize these. */ class NnetChainaModels { public: @@ -121,30 +110,28 @@ class NnetChainaModels { Constructor to which you pass the model directory and the den-fst directory. The directory structure is: /bottom.raw - should exist, and then for each language name "lang", the following + should exist, and then for each language name (e.g. "english"), the following files should exist: - /lang.mdl /lang.fst /lang.ada + /english.mdl /english.fst /english.ada + There is no requirement that all these directories be distinct. In practice, the language name will be either "default", in the typical (monolingual) setup, or it might be arbitrary strings - representing languages such as "english", "french" (in - + representing languages such as "english", "french", and so on. In general the language can be any string containing ASCII letters, numbers - or underscores, and it will be a suffix of the key in the egs that we are - reading, separated from them by a "-". E.g. if the key is - "143213423-1234123432_10-english", the language would be "english". - The models and denominator FSTs will only be read when they are - actually required. + or underscores. + + The models and denominator FSTs will only be read when they are actually + required, so languages that are not used by a particular job (e.g. because + they were not represented in the egs) will not actually be read. */ - NnetChainaModels(const std::string &model_dir, + NnetChainaModels(bool zero_component_stats, + const std::string &model_dir, const std::string &den_fst_dir, const std::string &transform_dir); Nnet* GetBottomNnet(); - int32 BottomNnetLeftContext() const; - int32 BottomNnetRightContext() const; - /** Returns the AmNnetSimple object corresponding to a given language name (e.g. 
"default", "english", "french"). Note: the model @@ -153,8 +140,7 @@ class NnetChainaModels { */ AmNnetSimple *GetNnetForLang(const std::string &language_name); - - const TransitionModel *GetTransitionModelForLang( + TransitionModel *GetTransitionModelForLang( const std::string &language_name); @@ -167,23 +153,55 @@ class NnetChainaModels { differentiable_transform::DifferentiableTransform *GetTransformForLang( const std::string &language_name); - - // Writes to 'langs' a vector (in no particular order) of the - // names of all the languages that have been loaded (this will depend - // on whether they were represented in the egs). This might - // be [ "default" ], or it might be [ "english", "french" ], for - // example. - void ListAllLanguages(std::vector *langs); - // Writes the files // /bottom..raw // and, for each language that we accessed, // /..raw void WriteRawModels(const std::string &model_out_dir, + bool binary, int32 job_id); ~NnetChainaModels(); private: + // This function sets "pathname" to the string: + // /. + void GetPathname(const std::string &dir, + const std::string &name, + const std::string &suffix, + std::string *pathname); + + // This version of GetPathname() sets "pathname" to the string: + // /.. + void GetPathname(const std::string &dir, + const std::string &name, + int32 job_id, + const std::string &suffix, + std::string *pathname); + + // struct LanguageInfo contains the data that is stored per language. + struct LanguageInfo { + // am_nnet comes from /.mdl, which also + // stores a TransitionModel. + TransitionModel trans_model; + AmNnetSimple am_nnet; + // den_fst comes from /.fst + fst::StdVectorFst den_fst; + // transform comes from /.ada + differentiable_transform::DifferentiableTransform *transform; + LanguageInfo(): transform(NULL) { } + ~LanguageInfo() { + delete transform; + } + }; + + + // get the LanguageInfo* for this language, creating it (and reading its + // contents from disk) if it does not already exist. + LanguageInfo *GetInfoForLang(const std::string &lang); + + // True if we are going to call ZeroComponentStats() on models when they are + // read. + bool zero_component_stats_; // Directory where models are located. std::string model_dir_; // Directory where denominator FSTs are located. @@ -197,41 +215,10 @@ class NnetChainaModels { int32 bottom_nnet_left_context_; int32 bottom_nnet_right_context_; - // Data that is loaded per language. - - struct LanguageInfo { - // trans_model and am_nnet come from /.mdl - TransitionModel trans_model; - AmNnetSimple am_nnet; - // den_fst comes from /.fst - fst::StdVectorFst den_fst; - // trans comes from /.ada - differentiable_transform::DifferentiableTransform *trans; - }; - - std::unordered_map lang_info_; + std::unordered_map lang_info_; }; -/** - steps of training: - - for a minibatch: - work out the language - work out how many chunks per speaker - work out the context and how many frames of embeddings are - needed. - - See whether we need backprop and model update for the two - passes of training. - Make the 3 computations. - - - - We need - - */ - /** This object, which has a similar function to NnetChainTrainer, trains the @@ -242,24 +229,27 @@ class NnetChainaTopTrainer { public: /** Constructor. - @param [in] lang_name The name of the language this corresponds to (for diagnostics). - E.g. "default", "english", etc. + @param [in] lang_name The name of the language this corresponds to + (needed for diagnostics). E.g. "default", + "english". 
@param [in] config Options class - @param [in] train_top_model True if we are training the 'top' model... this is one - configuration value that's outside 'config', that we need. @param [in] den_fst The denominator FST for this language @param [in] transform The transform object which will be used to produce adapted features after the first pass of training. - @param [in,out] nnet The neural net we are training. Expected to have outputs - called "output-si" (speaker-independent output), "output", - "output-si-xent", "output-xent", and an input called - "input". This class does not take ownership of the pointer. + @param [in,out] nnet The neural net we are training. Expected to have + outputs called "output-si" (speaker-independent + output), "output", "output-si-xent", "output-xent", + and an input called "input". This class does not + take ownership of the pointer, but it will modify + its parameters (and stored statistics) during + training. */ - NnetChainaTopTrainer(const std::string &lang_name, - const NnetChainaTrainingOptions &config, - const fst::StdVectorFst &den_fst, - const differentiable_transform::DifferentiableTransform &transform, - Nnet *nnet); + NnetChainaTopTrainer( + const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransform &transform, + Nnet *nnet); /** Train on one minibatch. @param [in] input The input (unadapted) features, most likely the embeddings @@ -270,33 +260,46 @@ class NnetChainaTopTrainer { @param [in] num_sequences The number of sequences/chunks represented in 'input' (a.k.a. the minibatch size). Actually this must be equal to supervision.num_sequences, but it's easier for - reasons of clarity and documentation repeat it here. + reasons of clarity and documentation to repeat it here. @param [in] num_spk The total number of speakers. Must be >1, and must divide num_sequences. The number of sequences per speaker must be the same for all speakers (it will equal num_sequences / num_spk), and the sequences for a speaker must be consecutively numbered. - @param [in] first_input_t The 't' value corresponding to the first input - frame (will normally be a negative number, corresponding to the left - context we are giving to the 'top' model, since we assume that the - sequences have 't' values starting from 0). The 't' values at - the input will be consecutive, and the number of frames per sequence - will equal input.NumRows() / num_sequences. Note: if the embeddings - are computed at a lower frame rate than the original features, we - renumber things to make the embeddings consecutive. + @param [in] first_input_t The 't' value corresponding to the first + input frame (will normally be a negative number, + corresponding to the left context we are giving to the + 'top' model, since we renumber to ensure that the sequences + have 't' values starting from 0). The 't' values at the + input will be consecutive, and the number of frames per + sequence will equal input.NumRows() / num_sequences. Note: + if the embeddings are computed at a lower frame rate than + the original features, we renumber things to make the + embeddings consecutive. @param [in] top_subsampling_factor The subsampling factor of the top network (which will equal the frame subsampling factor implicit in the original egs that we read, divided by bottom_subsampling_factor). E.g. this might frequently be 1 or 3. 
The frames at the output of the 'top' nnet are evaluated for 't' values that are multiples of 'top_subsampling_factor', starting from t=0. + @param [in] deriv_weights Per-frame weights that will be applied to the derivatives + w.r.t. the objective function. Dimension is expected to be either + input.NumRows(), or zero (in which case it is treated the same as a + vector containing all ones). @param [in] supervision The chain supervision object representing the objective function at the output. Its num_sequences must equal the - num_sequences passed into this function separately. + num_sequences passed into this function as a separate argument. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking any step. This will be + referred to elsewhere as top_weight, or "tw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the bottom nnet. If this is zero, we won't be training + the top model on this eg at all. @param [out] input_deriv If non-NULL, the derivative of the objective function - w.r.t. the input features will be written to here (this function assumes - that its value is zero on entry). - @return Returns true if it successfully trained on this minbiatch, false - on error (e.g. if a NaN was generated, which should not really happen). + w.r.t. the input features will be written to here (this function + will set it using Swap(), so you don't need to correctly size it). + @return Returns true if it successfully trained on this minbiatch, + false on error (e.g. if a NaN was generated, which should + not really happen). */ bool Train(const CuMatrixBase &input, int32 num_sequences, @@ -305,11 +308,16 @@ class NnetChainaTopTrainer { int32 top_subsampling_factor, const VectorBase &deriv_weights, const chain::Supervision &supervision, - CuMatrixBase *input_deriv = NULL); + BaseFloat model_training_scale, + CuMatrix *input_deriv = NULL); // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do + // this after the first minibatch of training, to reduce fragmentation. + void ConsolidateMemory(); + ~NnetChainaTopTrainer(); private: @@ -318,6 +326,7 @@ class NnetChainaTopTrainer { // vectors of Indexes in it. struct ComputationStructure { bool adapted; + bool train_model; bool need_input_deriv; int32 num_sequences; int32 frames_per_sequence_in; @@ -326,6 +335,7 @@ class NnetChainaTopTrainer { int32 top_subsampling_factor; inline bool operator == (const ComputationStructure &other) const { return adapted == other.adapted && + train_model == other.train_model && need_input_deriv == other.need_input_deriv && num_sequences == other.num_sequences && frames_per_sequence_in == other.frames_per_sequence_in && @@ -341,6 +351,8 @@ class NnetChainaTopTrainer { @param [in] adapted True if we want the outputs from "output" and "output-xent", and false if we want the outputs from "output-si" and "output-si-xent". + @param [in] train_model True if we will be training the acoustic + model with this example. @param [in] need_input_deriv True if we need the derivative w.r.t. the features that are the input to this computation. @param [in] num_sequences The number of sequences in this minibatch @@ -359,6 +371,7 @@ class NnetChainaTopTrainer { at frames t=0, t=3, and so on. 
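       As a purely illustrative example (these numbers are not taken from the
       code): a minibatch of 64 sequences, each with 150 embedding frames
       starting at t = -8 and with 47 output frames evaluated every 3 frames,
       would correspond to
       ComputationStructure(adapted, train_model, need_input_deriv,
                            64, 150, 47, -8, 3).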
*/ ComputationStructure(bool adapted, + bool train_model, bool need_input_deriv, int32 num_sequences, int32 frames_per_sequence_in, @@ -368,7 +381,9 @@ class NnetChainaTopTrainer { }; struct ComputationHasher { inline size_t operator() (const ComputationStructure &s) const { - return size_t(s.num_sequences) + + return (s.adapted ? 33 : 0) + + (s.train_model ? 333 : 0) + + size_t(s.num_sequences) + 10 * size_t(s.frames_per_sequence_in) + 100 * size_t(s.frames_per_sequence_out) + 1000 * size_t(s.first_input_t) + @@ -399,6 +414,12 @@ class NnetChainaTopTrainer { @param [in] supervision The chain supervision object. The nnet output dimensions are worked out from this, as well as using this object to compute the objective function. + @param [in] model_training_scale A scale we'll apply to the parameter + changes and max-change values when taking any step. + This will be the product of the top_weight ("tw") from + the key in the egs, with the value of the + --unadapted-top-weight option. If this is zero, we + won't be training the top model on this eg at all. @param [in] deriv_weights Weights to be applied to the derivatives for the corresponding frames of the output (order is: first frame for all sequences; second frame for @@ -413,34 +434,54 @@ class NnetChainaTopTrainer { and the order is: all sequences' frame 0; then all sequences' frame 1; and so on. @param [out] input_deriv Derivative w.r.t. the input features; this will - be added to, if it is not NULL. This function - applies the scale opts_.unadapted_backprop_weight - after adding this derivative to it. (The scale - opts_.unadapted_backprop_scale is implicitly - included already as we already scaled the objf - derivatives). + be set via Swap(), if it is not NULL. Any weight to + (be applied e.g. opts_.unadapted_bottom_weight), + should be applied by the caller. @return Returns true if the training went through successfully (it should very rarely return false, e.g. if a NaN was generated). */ bool TrainUnadapted(const CuMatrixBase &input, const NnetComputation &computation, const chain::Supervision &supervision, + BaseFloat model_training_scale, const CuVectorBase &deriv_weights, Posterior *posterior, - CuMatrixBase *input_deriv); + CuMatrix *input_deriv); /** Converts the format of the posterior from how it is at the output of the network to how it is at the input (i.e. in the embedding space). Basically, this will consist of padding with empty posteriors for the "context frames", and possibly upsampling the posteriors (by just repeating - each one for, say, 3 frames, if top_subsampling_factor == 3). + each one for, say, 3 frames, if top_subsampling_factor == 3). The + rule we'll use is: copy the posterior from the output frame that + is closest in numbering, rounding down in case of ties (i.e., for even + subsampling factor). + + @param [in] post_at_output The posterior that needs to be padded, + consisting of 'num_sequences' sequences, each with 't' + values starting at zero, at multiples of + 'top_subsampling_factor', and with number of 't' values + determined by: num_frames_out = post_at_output.size() / + num_sequences. The 't' has the larger stride than the + minibatch index 'n', so it's: frame t=0 of all sequences, + then frame t=1*top_subsampling_factor of all sequences, + and so on. + @param [in] num_sequences The number of sequences/chunks + @param [in] first_input_t The first 't' value at the input, for which + we need a posterior for (note: negative 't' values will + get zero posterior). Implicitly, first_output_t = 0. 
+ The number of input frames is worked out as + post_at_input->size() / num_sequences; the 't' values + at the input are assumed to be consecutive. + @param [in] top_subsampling_factor The number of frames with which + 't' values at the output are separated. + @param [out] post_at_input The posterior after padding and possibly + subsampling. Should have the correct size but its + elements are expected to be empty at entry. Like + post_at_output, the 't' has the larger stride than + the minibatch-index 'n'. - The number of frames per sequence at the output will equal - post_at_output.size() / num_sequences, and the number of frames per - sequence at the input will equal post_at_inptu->size() / num_sequences - (note: this means 'post_at_input is expected to be appropriately sized - when this function is called). */ void ConvertPosterior(const Posterior &post_at_output, int32 num_sequences, @@ -450,7 +491,6 @@ class NnetChainaTopTrainer { /** Does the adapted pass of training. - @param [in] input The adapted input features. @param [in] computation The adapted version of the computation (this one uses the outputs "output" and "output-xent" instead of @@ -458,6 +498,12 @@ class NnetChainaTopTrainer { @param [in] supervision The chain supervision object, containing information derived from the numerator lattices. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking any step. This will be + referred to elsewhere as top_weight, or "tw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the bottom nnet. If this is zero, we won't be training + the top model on this eg at all. @param [in] deriv_weights Weights to be applied to the derivatives for the corresponding frames of the output (order is: first frame for all sequences; second frame for @@ -465,20 +511,26 @@ class NnetChainaTopTrainer { egs. If this is the empty vector or --apply-deriv-weights=false, they won't be appplied. + @param [in] input The adapted input features. Provided as a non-const + pointer because it is consumed destructively (via Swap()). @param [in,out] input_deriv If non-NULL, the feature derivative w.r.t. the [speaker-adapted] input - features will be *added* to this location. + features will be written to this location. It's + done via Swap(), so it doesn't have to be correctly + sized on entry. @return */ - bool TrainAdapted(const CuMatrixBase &input, - const NnetComputation &computation, + bool TrainAdapted(const NnetComputation &computation, const chain::Supervision &supervision, + BaseFloat model_training_scale, const CuVectorBase &deriv_weights, - CuMatrixBase *input_deriv); + CuMatrix *input, + CuMatrix *input_deriv); - - void ProcessOutputs(const NnetChainExample &eg, - NnetComputer *computer); + // This function increments num_minibatches_processed_, but before + // doing so, if it notices that it is zero it makes certain calls + // to ConsolidateMemory() + void IncrementNumMinibatches(); std::string lang_name_; @@ -506,9 +558,9 @@ class NnetChainaTopTrainer { // speaker-dependent passes. int32 num_minibatches_processed_; - // stats for max-change (for speaker-independent model). + // stats for max-change (for speaker-independent phases of training) MaxChangeStats max_change_stats_si_; - // stats for max-change (for speaker-adapted model). 
+ // stats for max-change (for speaker-adapted phases of training) MaxChangeStats max_change_stats_; }; @@ -553,25 +605,38 @@ class NnetChainaBottomTrainer { @param [in] frames_per_sequence_out The number of output frames per sequence. This is determined by the context of the top and bottom nnets and the "keep_embedding_context" config value. + @param [in] train_model True if we'll be training the bottom model + for this eg. If this is false, a backward pass will not be. + needed, and this function will return NULL @param [in] input The input features, most likely raw MFCC or filterbank features. A pointer, since it is consumed destructively (via 'swap'). - @param [out] output The output will be written to here. - @return Returns the NnetComputer object that we did the computation with; - the user should either pass this into Backward(), or delete it. + @param [out] output The output will be written to here. Does not have + to be correctly sized (we'll copy using Swap()). + @return Returns the NnetComputer object that we did the computation with, + if train_model == true (otherwise, returns NULL). + The user should either pass this into Backward(), or delete it. */ NnetComputer* Forward(int32 num_sequences, int32 first_input_t, int32 first_output_t, int32 frames_per_sequence_out, + bool train_model, CuMatrix *input, CuMatrix *output); /** - Does the backward pass, which will do model training. This will only be - called if the bottom nnet needs to be trained (otherwise the caller will - delete the 'computer' object. + Does the backward pass, which will do model training. This should only be + called if the bottom nnet needs to be trained. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking the step.. This will be + referred to elsewhere as bottom_weight, or "bw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the top nnet. If this is zero, we won't be training + the top model on this eg at all (and we'll expect 'false' to + have been passed in for the 'train_model' arg on the corresponding + call to Forward()). @param [in] computer The computer object returned from the forward pass. This function takes ownership of it and will delete it when done with it. @@ -580,13 +645,17 @@ class NnetChainaBottomTrainer { by this function. */ - void Backward(NnetComputer *computer, + void Backward(BaseFloat model_training_scale, + NnetComputer *computer, CuMatrix *output_deriv); // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do + // this after the first minibatch of training, to reduce fragmentation. + void ConsolidateMemory(); ~NnetChainaBottomTrainer(); private: @@ -710,8 +779,18 @@ class NnetChainaTrainer { NnetChainaTrainer(const NnetChainaTrainingOptions &config, NnetChainaModels *models); - // train on one minibatch. - void Train(const NnetChainExample &eg); + /* Train on one minibatch. + @param [in] key The key the example had in the archive. This is + used to work out the language name. + @param [in] eg The example we are training on. It is expected + to have an input named 'input' (the features) and an + output named 'output' (containing the chain supervision + object). We'll make use of the chunks_per_spk member + of the NnetChainSupervision object, which is not used + outside the 'chaina' framework. 
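+     As an example of the key format (reusing the example key from the .cc
+     file), a key like "afsdadsfds12345?lang=english&tw=1.0&bw=0.5" would
+     select the 'english' top model and scale the top-model and bottom-model
+     updates by 1.0 and 0.5 respectively.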
+ */ + void Train(const std::string &key, + const NnetChainExample &eg); // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -721,15 +800,27 @@ class NnetChainaTrainer { void PrintMaxChangeStats() const; ~NnetChainaTrainer(); + private: - void FindEgStructure + void GetContextInfo(const std::string &lang, + int32 *bottom_left_context, + int32 *bottom_right_context, + int32 *top_left_context, + int32 *top_right_context); + + + NnetChainaTopTrainer *GetTopTrainerForLang(const std::string &lang); const NnetChainaTrainingOptions &opts_; // pointer to object owned outside this class. NnetChainaModels *models_; + // left and right context of bottom model. + int32 bottom_left_context_; + int32 bottom_right_context_; + NnetChainaBottomTrainer bottom_trainer_; // map from language name (e.g. "default", "english", "french") to // the object that trains the corresponding 'top' nnet. diff --git a/src/nnet3a/nnet-chaina-utils.h b/src/nnet3a/nnet-chaina-utils.h index 0259459496f..4f028a4af0b 100644 --- a/src/nnet3a/nnet-chaina-utils.h +++ b/src/nnet3a/nnet-chaina-utils.h @@ -32,6 +32,149 @@ namespace kaldi { namespace nnet3 { +/** + This function works out certain structural information from an example for + 'chaina' (adapted chain) training. It assumes (and spot-checks) that the eg + has a single input, called 'input', with a regular structure where the 'n' + has the highest stride so it's: all frames for sequence 0; all frames for + sequence 1; and so on. It will raise an exception if the example does not, + in some respect, have the expected structure. + + @param [in] The example we are getting the structural information from + @param [out] num_sequences The number of sequences/chunks (actually just + the num_sequences in the eg.supervision object). + @param [out] chunks_per_spk The number of chunks per speaker + (just eg.chunks_per_spk) + @param [out] first_input_t The lowest numbered 't' value in the inputs. + Usually will be negative. This function requires the + input 't' values to be consecutive, and will crash + if they are not. + @param [out] num_input_frames The number of input frames. The last input + 't' value will be first_input_t + num_input_frames - 1. + @param [out] num_output_frames The number of output frames (which are + assumed to start from t=0 and to be spaced by + 'frame_subsampling_factor. + @param [out] frame_subsampling_factor The spacing on the output frames, + equal to the amount of subsampling that happens + between the input and the output (this will + later be factorized as: + frame_subsampling_factor = + bottom_subsampling_factor * top_subsampling_factor. + @param [out] eg_left_context Just as a convenience, this function outputs + the left-context in the example, which equals + first_output_t - first_input_t = -first_input_t. + @param [out] eg_right_context Again just as a convenience, this function + outputs the right-context of the example, which + equals last_input_t - last_output_t = + (first_input_t + num_input_frames - 1) - + (first_output_t + num_output_frames - 1) * frame_subsampling_factor + (note: first_output_t is zero). +*/ +void FindChainaExampleStructure(const NnetChainExample &eg, + int32 *num_sequences, + int32 *chunks_per_spk, + int32 *first_input_t, + int32 *num_input_frames, + int32 *num_output_frames, + int32 *frame_subsampling_factor, + int32 *eg_left_context, + int32 *eg_right_context); + +/** + This function computes some info about which frames we need to compute the + embeddings for (i.e. 
which frames we need to request at the output of the + bottom nnet). It will print a warning and return false if the egs had + insufficient context to compute what is requested. + + @param [in] first_input_t The first 't' value for the input that + is provided to the bottom nnet. + @param [in] num_input_frames The number of input frames provided to + the bottom nnet; these are assumed to be consecutive. + @param [in] num_output_frames The number of output frames that we + need to compute the output for (this will be + the sequence_length in the chain supervision object). + @param [in] frame_subsampling_factor The factor by which we + subsample to get the final output (includes subsampling + in both the bottom and top nnet). + @param [in] bottom_subsampling_factor The amount of subsampling + for getting the embeddings (i.e. the embeddings + are obtained at t = multiples of this value.) + Must be >0 and divide frame_subsampling_factor. + This must be provided and can't be worked out from + the nnets, because the top nnet uses a different frame + numbering-- i.e. we divide the 't' values by + 'bottom_subsampling_factor' so that the inputs to the + top nnet are consecutive. This will make it easier + to apply the top nnet separately from binaries. + @param [in] bottom_left_context The num-frames of left-context that the + bottom nnet requires + @param [in] bottom_right_context The num-frames of right-context that the + bottom nnet requires + @param [in] top_left_context The num-frames of left-context that the + top nnet requires. Note: this is *after* dividing the + 't' values by bottom_subsampling_factor, so the number + top_left_context * bottom_subsampling_factor can be used + to compute the total left-context that we need to put in + the egs. + @param [in] top_right_context The num-frames of right-context that the + top nnet requires. See docs for top_left_context for more + info RE frame subsampling + @param [in] keep_embedding_context True if we want to compute as + many frames of the embedding as we can given the amount + of available left context in the input. This will be + usually be set to true if the top nnet is recurrent or + can otherwise consume extra context. + @param [out] first_embedding_t First 't' value of the embedding. CAUTION: + this is in the original frame numbering (the one we use + for the bottom nnet), and will be a multiple of + 'bottom_subsampling_factor'. You need to divide by + 'bottom_subsampling_factor' to get the 't' value used + at the input of the top nnet. + @param [out] num_embedding_frames The number of embedding frames that + we are computing. + @return Returns true if it could successfully compute the output, + and false if it could not because of insufficient input + context. + */ +bool ComputeEmbeddingTimes(int32 first_input_t, + int32 num_input_frames, + int32 num_output_frames, + int32 frame_subsampling_factor, + int32 bottom_subsampling_factor, + int32 bottom_left_context, + int32 bottom_right_context, + int32 top_left_context, + int32 top_right_context, + bool keep_embedding_context, + int32 *first_embedding_t, + int32 *num_embedding_frames); + + +/** + This function parses a string value from a 'url-like' string (which is probably actually + a key value from an scp file). The general format this function handles is: + iiiiiiiiiiiiiiiiiii?aaa=xxxx&bbb=yyyy + where the only 'special characters' are '?' and '&'. This is modeled after a query + string in HTML. This function searches for a key name with the value 'key_name', + (e.g. 
'aaa' or 'bbb' in the example), and if it exists, sets `value` to that value + (e.g. 'xxxx' or 'yyyy' in the example. If the string `string` has no '?' in it, + or the key name `key_name` is not present, this function returns false; otherwise, + it returns true and sets `value` to that value. + +*/ +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + std::string *value); + + +// This overloaded version of ParseFromQueryString()is for where a float value +// is required. If the key is present but cannot be turned into a float, it +// will raise an error. +bool ParseFromQueryString(const std::string &string, + const std::string &key, + BaseFloat *f); + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update index fc1a3492cca..1f0f9820b2a 100644 --- a/src/nnet3a/notes.update +++ b/src/nnet3a/notes.update @@ -81,6 +81,27 @@ prepare_egs.sh... === +nnet3-chain-merge-egs --keep-distinct + + + ?aaa=xxxx&bbb=yyyy + + + + +``` +This copies nnet3+chain training examples from input to output, merging them +into composite examples. The --minibatch-size option controls how many egs +are merged into a single output eg. + +Usage: nnet3-chain-merge-egs [options] +e.g. +nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... +See also nnet3-chain-copy-egs +``` + + + BUT, we don't want to do this on minibatches @@ -97,7 +118,14 @@ prepare_egs.sh... We'll use that info, together with the speaker-id and utt2uniq information, to merge chunks together into groups (preferably by utterance; if not, by speaker) - in process_egs.sh (the merging will be done in python). + in process_egs.sh (the choice of which egs to merge will be done in python). + +====== + +====== + later, when + + The merging script will decide the key for the merged egs. process_egs.sh will dump these as archives *and* scp files, but they will now be in groups of chunks_per_spk (e.g. 4). The language name will be added as the From c049d11c0d0a5bd5503fa9d5b631f7bc75f2aba9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 28 Dec 2018 12:22:29 -0800 Subject: [PATCH 44/87] [src] Updates to chaina training code. 
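This adds --bottom-model-test-mode and --top-model-test-mode options (plus the
corresponding plumbing in NnetChainaModels and the trainers), so that the same
code paths can be used to evaluate train/valid probs without updating the
models or disturbing their stored batchnorm stats.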
--- src/nnet3a/nnet-chaina-training.cc | 69 +++++++++++++++++++++++------- src/nnet3a/nnet-chaina-training.h | 60 ++++++++++++++++++++++++-- 2 files changed, 111 insertions(+), 18 deletions(-) diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 79940238e05..ec04a05b9cb 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -26,21 +26,32 @@ namespace nnet3 { NnetChainaModels::NnetChainaModels( bool zero_component_stats, + bool bottom_model_test_mode, + bool top_model_test_mode, const std::string &model_dir, const std::string &den_fst_dir, const std::string &transform_dir): zero_component_stats_(zero_component_stats), + bottom_model_test_mode_(bottom_model_test_mode), + top_model_test_mode_(top_model_test_mode), model_dir_(model_dir), den_fst_dir_(den_fst_dir), transform_dir_(transform_dir) { std::string bottom_nnet_name; // model_dir/bottom.raw GetPathname(model_dir, "bottom", "raw", &bottom_nnet_name); ReadKaldiObject(bottom_nnet_name, &bottom_nnet_); - if (zero_component_stats_) + if (zero_component_stats_ && !bottom_model_test_mode_) ZeroComponentStats(&bottom_nnet_); ComputeSimpleNnetContext(bottom_nnet_, &bottom_nnet_left_context_, &bottom_nnet_right_context_); + if (bottom_model_test_mode_) { + SetBatchnormTestMode(true, &bottom_nnet_); + SetDropoutTestMode(true, &bottom_nnet_); + // The following is for efficiency in evaluating the bottom nnet, + // it may combine certain component types. + CollapseModel(CollapseModelConfig(), &bottom_nnet_); + } } void NnetChainaModels::GetPathname(const std::string &dir, @@ -80,9 +91,17 @@ NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( Input ki(model_filename, &binary); info->trans_model.Read(ki.Stream(), binary); info->am_nnet.Read(ki.Stream(), binary); - if (zero_component_stats_) { + if (zero_component_stats_ && !top_model_test_mode_) { ZeroComponentStats(&(info->am_nnet.GetNnet())); } + if (top_model_test_mode_) { + Nnet &nnet = info->am_nnet.GetNnet(); + SetBatchnormTestMode(true, &nnet); + SetDropoutTestMode(true, &nnet); + // The following is for efficiency in evaluating the top nnet, + // it may combine certain component types. + CollapseModel(CollapseModelConfig(), &bottom_nnet_); + } } ReadFstKaldi(den_fst_filename, &(info->den_fst)); { @@ -137,11 +156,15 @@ NnetChainaModels::GetTransformForLang( void NnetChainaModels::WriteRawModels(const std::string &model_out_dir, bool binary, int32 job_id) { - std::string bottom_model_name; - GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); - WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + if (!bottom_model_test_mode_) { + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + } + std::ostringstream lang_names_ss; for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { const std::string &lang_name = iter->first; + lang_names_ss << lang_name << " "; LanguageInfo *info = iter->second; { // we write it as a 'raw' model without the TransitionModel or @@ -152,6 +175,9 @@ void NnetChainaModels::WriteRawModels(const std::string &model_out_dir, WriteKaldiObject(info->am_nnet.GetNnet(), top_model_name, binary); } } + KALDI_LOG << "Wrote " << (bottom_model_test_mode_ ? 
"" : " bottom nnet and ") + << "nnets for languages " << lang_names_ss.str() << "to " + << model_out_dir; } @@ -241,7 +267,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( ComputationRequest request; request.need_model_derivative = s.train_model; - request.store_component_stats = true; + request.store_component_stats = !opts_.top_model_test_mode; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); @@ -262,7 +288,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( // the second frame of all sequences; and so on. request.outputs.resize(2); request.outputs[0].name = (s.adapted ? "output" : "output-si"); - request.outputs[0].has_deriv = true; + request.outputs[0].has_deriv = !opts_.top_model_test_mode; request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); int32 t_stride_out = top_subsampling_factor; iter = request.outputs[0].indexes.begin(); @@ -274,7 +300,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( iter->t = t; } } - request.outputs[1].has_deriv = true; + request.outputs[1].has_deriv = !opts_.top_model_test_mode; request.outputs[1].name = (s.adapted ? "output-xent" : "output-xent-si"); request.outputs[1].indexes = request.outputs[0].indexes; std::shared_ptr computation = compiler_.Compile( @@ -309,6 +335,8 @@ bool NnetChainaTopTrainer::TrainUnadapted( const CuMatrixBase &output = computer.GetOutput("output-si"), &output_xent = computer.GetOutput("output-si-xent"); + // It's not optimal that we compute these derivatives even when we're not + // training, but the 'compute-prob' phase doesn't dominate. CuMatrix output_deriv(output.NumRows(), output.NumCols(), kUndefined), @@ -344,6 +372,7 @@ bool NnetChainaTopTrainer::TrainUnadapted( tot_weight, xent_objf); } + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { output_deriv.MulRowsVec(deriv_weights); output_xent_deriv.MulRowsVec(deriv_weights); @@ -772,11 +801,16 @@ NnetChainaBottomTrainer::NnetChainaBottomTrainer( std::shared_ptr NnetChainaBottomTrainer::GetComputation( const ComputationStructure &s) { - { + { // Check in the cache, in case we already handled this computation. auto iter = computation_map_.find(s); if (iter != computation_map_.end()) return iter->second; } + + if (opts_.bottom_model_test_mode) { + KALDI_ASSERT(!s.train_model); + } + int32 num_sequences = s.num_sequences, frames_per_sequence_in = s.frames_per_sequence_in, frames_per_sequence_out = s.frames_per_sequence_out, @@ -791,7 +825,10 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( ComputationRequest request; request.need_model_derivative = s.train_model; - request.store_component_stats = true; + // If the user supplied the option --train-bottom-model false, then we + // are using test-mode for the batch-norm on the bottom model, and we + // don't want to overwrite the batch-norm stats. 
+ request.store_component_stats = !opts_.bottom_model_test_mode; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); @@ -895,10 +932,12 @@ void NnetChainaTrainer::Train(const std::string &key, ParseFromQueryString(key, "lang", &lang_name); ParseFromQueryString(key, "tw", &top_weight); ParseFromQueryString(key, "bw", &bottom_weight); - if (!(top_weight >= 0.0 && bottom_weight >= 0.0 && - (top_weight > 0.0 || bottom_weight > 0.0))) - KALDI_ERR << "Either the top or bottom weight " - "must be nonzero; neither can be negative: key=" << key; + KALDI_ASSERT(top_weight >= 0.0 && bottom_weight >= 0.0); + + if (opts_.bottom_model_test_mode) + bottom_weight = 0.0; + if (opts_.top_model_test_mode) + top_weight = 0.0; int32 num_sequences, chunks_per_spk, first_input_t, num_input_frames, num_output_frames, @@ -931,7 +970,7 @@ void NnetChainaTrainer::Train(const std::string &key, kUndefined), cu_embedding; eg_input.CopyToMat(&cu_input); - bool train_bottom_nnet = (bottom_weight != 1.0); + bool train_bottom_nnet = bottom_weight != 1.0; KALDI_ASSERT(cu_input.NumRows() == num_input_frames * num_sequences); NnetComputer *computer = bottom_trainer_.Forward( diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index b077d3b3ecd..9ec126a167a 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -42,13 +42,17 @@ struct NnetChainaTrainingOptions { BaseFloat unadapted_bottom_weight; int32 bottom_subsampling_factor; bool keep_embedding_context; + bool bottom_model_test_mode; + bool top_model_test_mode; NnetChainaTrainingOptions(): apply_deriv_weights(true), unadapted_top_weight(1.0), unadapted_bottom_weight(0.5), bottom_subsampling_factor(1), - keep_embedding_context(true) { } + keep_embedding_context(true), + bottom_model_test_mode(false), + top_model_test_mode(false) { } void Register(OptionsItf *opts) { nnet_config.Register(opts); @@ -83,6 +87,20 @@ struct NnetChainaTrainingOptions { "optional dependencies (for example: if it uses " "StatisticsExtractionComponent, IfDefined(), Failover(), " "etc.)."); + opts->Register("bottom-model-test-mode", &bottom_model_test_mode, + "Set this to true to disable training of the bottom nnet, " + "to use test-mode for any batch-norm or dropout" + "components in it, and to disable the accumulation of " + "statistics for the bottom model (to keep the batchnorm " + "stats frozen). Setting this to false can be used to " + "evaluate train or valid probs."); + opts->Register("top-model-test-mode", &top_model_test_mode, + "Set this to true to disable training of the top nnet, " + "to use test-mode for any batch-norm or dropout" + "components in it, and to disable the accumulation of " + "statistics for the top model (to keep the batchnorm " + "stats frozen). Setting this to false can be used to " + "evaluate train or valid probs."); } void Check() { KALDI_ASSERT(unadapted_top_weight > 0.0 && @@ -124,8 +142,40 @@ class NnetChainaModels { The models and denominator FSTs will only be read when they are actually required, so languages that are not used by a particular job (e.g. because they were not represented in the egs) will not actually be read. + + + @param [in] zero_components stats... The --zero-component-stats option + from NnetChainaTrainingOptions::nnet_config. Note: if + bottom_model_test_mode is true, we won't zero the stats on + the bottom model regardless of this value. 
+ @param [in] bottom_model_test_mode If true, the bottom model will not be + trained (should be set to the same-named option from + NnetChainaTrainingOptions). It's needed to know + whether to write the bottom model in WriteRawModels(), + and whether to zero the component stats, set batch-norm + test mode, and collapse the model. + @param [in] top_model_test_mode If true, the top model will not be + trained (should be set to the same-named option from + NnetChainaTrainingOptions). It's needed to know + whether to write the top models in WriteRawModels(), + and whether to zero the component stats, set batch-norm + test mode, and collapse the model. + @param [in] model_dir Directory where we'll find bottom.raw, and + .mdl for each language present in the egs + (the will be worked out from the key name from + "...?lang=xxx" in the key when reading the egs, + see ParseFromQueryString() in nnet-chain-utils.h. + @param [in] den_fst_ir Directory where we'll find the denominator + FST .fst for each language present in + the egs. + @param [in] transform_dir Directory where we'll find the + transforms (of type DifferentiableTransformItf), + as files .ada for each language present + in the egs. */ NnetChainaModels(bool zero_component_stats, + bool bottom_model_test_mode, + bool top_model_test_mode, const std::string &model_dir, const std::string &den_fst_dir, const std::string &transform_dir); @@ -202,6 +252,10 @@ class NnetChainaModels { // True if we are going to call ZeroComponentStats() on models when they are // read. bool zero_component_stats_; + // A copy of the "bottom-model-test-mode" option in NnetChainaTrainingOptions. + bool bottom_model_test_mode_; + // A copy of the "top-model-test-mode" option in NnetChainaTrainingOptions. + bool top_model_test_mode_; // Directory where models are located. std::string model_dir_; // Directory where denominator FSTs are located. @@ -232,8 +286,8 @@ class NnetChainaTopTrainer { @param [in] lang_name The name of the language this corresponds to (needed for diagnostics). E.g. "default", "english". - @param [in] config Options class - @param [in] den_fst The denominator FST for this language + @param [in] config Options class + @param [in] den_fst The denominator FST for this language @param [in] transform The transform object which will be used to produce adapted features after the first pass of training. @param [in,out] nnet The neural net we are training. Expected to have From e9b1e8b467600500fbb9b703143a36ea4bcc5f0f Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 29 Dec 2018 16:12:53 -0500 Subject: [PATCH 45/87] Small change to make sure numerator post is always computed if requested --- src/chain/chain-training.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 5cf9313fccb..3734d6a3008 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -27,7 +27,9 @@ namespace kaldi { namespace chain { - +/* This function converts the pdf occupation probabilties (e.g. computed + using Forward-Backward on the numerator graph) to posteriors. 
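+   Entry (i, j) of 'numerator_derivs' is the occupation probability of
+   pdf-id j on frame i; only the nonzero entries are copied to the output
+   Posterior, as (pdf-id, probability) pairs.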
+ */ void ConvertDerivsToPosterior(const CuMatrixBase &numerator_derivs, Posterior *numerator_post) { numerator_post->resize(numerator_derivs.NumRows()); @@ -165,6 +167,19 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, CuMatrixBase *nnet_output_deriv, CuMatrix *xent_output_deriv, Posterior *numerator_post) { + if (!nnet_output_deriv && !xent_output_deriv && numerator_post) { + // To compute the posteriors, we will need to compute the numerator + // derivatives first (and to compute them, at least one of the *_deriv + // arguments should be non-NULL). + CuMatrix xent_deriv; + // Rcurse + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, objf, l2_term, + weight, nnet_output_deriv, + &xent_deriv, numerator_post); + return; + } + if (!supervision.e2e_fsts.empty()) { ComputeChainObjfAndDerivE2e(opts, den_graph, supervision, nnet_output, objf, l2_term, From 66b9c96f4b01095447c4e369a6ecc165f0135458 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 29 Dec 2018 16:18:36 -0500 Subject: [PATCH 46/87] Typo fix --- src/chain/chain-training.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 3734d6a3008..6ac96e85593 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -172,7 +172,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // derivatives first (and to compute them, at least one of the *_deriv // arguments should be non-NULL). CuMatrix xent_deriv; - // Rcurse + // Recurse ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output, objf, l2_term, weight, nnet_output_deriv, From 67ba16248c93aa920c3724d41f3ba8216aec042e Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 30 Dec 2018 09:37:05 -0500 Subject: [PATCH 47/87] Add --long-key option for nnet3-chain-get-egs --- src/chainbin/nnet3-chain-get-egs.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 1032b7e2125..85e26bc7f30 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -95,7 +95,7 @@ static bool ProcessFile(const TransitionModel *trans_mdl, const VectorBase *deriv_weights, int32 supervision_length_tolerance, const std::string &utt_id, - bool compress, + bool compress, bool long_key, UtteranceSplitter *utt_splitter, NnetChainExampleWriter *example_writer) { KALDI_ASSERT(supervision.num_sequences == 1); @@ -228,9 +228,14 @@ static bool ProcessFile(const TransitionModel *trans_mdl, nnet_chain_eg.Compress(); std::ostringstream os; - os << utt_id << "-" << chunk.first_frame; + if (long_key) + os << utt_id + << "-" << chunk.first_frame << "-" << chunk.left_context + << "-" << chunk.num_frames << "-" << chunk.right_context << "-v1"; + else // key is - + os << utt_id << "-" << chunk.first_frame; - std::string key = os.str(); // key is - + std::string key = os.str(); example_writer->Write(key, nnet_chain_eg); } @@ -265,7 +270,7 @@ int main(int argc, char *argv[]) { "Note: the --frame-subsampling-factor option must be the same as given to\n" "chain-get-supervision.\n"; - bool compress = true; + bool compress = true, long_key = false; int32 length_tolerance = 100, online_ivector_period = 1, supervision_length_tolerance = 1; @@ -311,6 +316,8 @@ int main(int argc, char *argv[]) { "Filename of transition model to read; should only be supplied " "if you want 'unconstrained' egs, and if you supplied " 
"--convert-to-pdfs=false to chain-get-supervision."); + po.Register("long-key", &long_key, "If true, a long format will be used " + "for the key, which encodes context info, etc."); eg_config.Register(&po); @@ -426,7 +433,7 @@ int main(int argc, char *argv[]) { if (!ProcessFile(trans_mdl_ptr, normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, deriv_weights, supervision_length_tolerance, - key, compress, + key, compress, long_key, &utt_splitter, &example_writer)) num_err++; } From 902a5abed1846a13649ea696e6f6eceabbd180e8 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 1 Jan 2019 13:37:51 -0500 Subject: [PATCH 48/87] Compute numerator posteriors in forward-backward --- src/chain/chain-generic-numerator.cc | 30 +++++++++++++++++++++++- src/chain/chain-generic-numerator.h | 4 +++- src/chain/chain-numerator.cc | 29 +++++++++++++++++++++++- src/chain/chain-numerator.h | 5 ++-- src/chain/chain-training.cc | 34 +++++----------------------- 5 files changed, 69 insertions(+), 33 deletions(-) diff --git a/src/chain/chain-generic-numerator.cc b/src/chain/chain-generic-numerator.cc index d3a114242c2..7453568913a 100644 --- a/src/chain/chain-generic-numerator.cc +++ b/src/chain/chain-generic-numerator.cc @@ -209,9 +209,33 @@ BaseFloat GenericNumeratorComputation::AlphaRemainingFrames(int seq, return log_prob_product + log_scale_product; } +/* This function converts the pdf occupation probabilties (computed + using Forward-Backward on the numerator graph) to posteriors. + "derivs" is frames_per_sequence by pdf_index_size (i.e., indices.size()) +*/ +static void ConvertDerivsToPosterior(const MatrixBase &derivs, + const std::vector &indices, + int32 pdf_stride, + int32 frames_per_sequence, + int32 num_sequences, + Posterior *post) { + post->resize(frames_per_sequence * num_sequences); + for (size_t t = 0; t < derivs.NumRows(); ++t) + for (int32 n = 0; n < derivs.NumCols(); ++n) { + BaseFloat posterior = Exp(derivs(t, n)); + if (posterior != 0.0) { + int32 seq = indices[n] / pdf_stride; + int32 pdfid = indices[n] % pdf_stride; + (*post)[t * num_sequences + seq].push_back( + std::make_pair(pdfid, posterior)); + } + } +} + bool GenericNumeratorComputation::ForwardBackward( BaseFloat *total_loglike, - CuMatrixBase *nnet_output_deriv) { + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post) { KALDI_ASSERT(total_loglike != NULL); KALDI_ASSERT(nnet_output_deriv != NULL); KALDI_ASSERT(nnet_output_deriv->NumCols() == nnet_output_.NumCols()); @@ -243,6 +267,10 @@ bool GenericNumeratorComputation::ForwardBackward( if (GetVerboseLevel() >= 1) ok = ok && CheckValues(seq, probs, alpha, beta, derivs); } + if (numerator_post) + ConvertDerivsToPosterior(derivs, index_to_pdf_, nnet_output_.Stride(), + supervision_.frames_per_sequence, + num_sequences, numerator_post); // Transfer and add the derivatives to the values in the matrix AddSpecificPdfsIndirect(&derivs, index_to_pdf_, nnet_output_deriv); *total_loglike = partial_loglike; diff --git a/src/chain/chain-generic-numerator.h b/src/chain/chain-generic-numerator.h index fc5e00b2c63..2becfd56051 100644 --- a/src/chain/chain-generic-numerator.h +++ b/src/chain/chain-generic-numerator.h @@ -33,6 +33,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" @@ -121,7 +122,8 @@ class GenericNumeratorComputation { // nnet output w.r.t. 
the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. bool ForwardBackward(BaseFloat *total_loglike, - CuMatrixBase *nnet_output_deriv); + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post = NULL); BaseFloat ComputeObjf(); private: diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc index 139d28bdd77..caba37023a7 100644 --- a/src/chain/chain-numerator.cc +++ b/src/chain/chain-numerator.cc @@ -146,9 +146,29 @@ BaseFloat NumeratorComputation::Forward() { return tot_log_prob_ * supervision_.weight; } +/* This function converts the pdf occupation probabilties (computed + using Forward-Backward on the numerator graph) to posteriors. +*/ +static void ConvertDerivsToPosterior( + const Vector &derivs, + const std::vector &nnet_output_indexes, + int32 nnet_output_rows, + Posterior *post) { + post->resize(nnet_output_rows); + for (size_t i = 0; i < nnet_output_indexes.size(); ++i) { + if (derivs(i) != 0.0) { + int32 row = nnet_output_indexes[i].first; + int32 pdfid = nnet_output_indexes[i].second; + (*post)[row].push_back( + std::make_pair(pdfid, derivs(i))); + } + } +} + void NumeratorComputation::Backward( - CuMatrixBase *nnet_output_deriv) { + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post) { const fst::StdVectorFst &fst = supervision_.fst; int32 num_states = fst.NumStates(); log_beta_.Resize(num_states, kUndefined); @@ -201,6 +221,13 @@ void NumeratorComputation::Backward( KALDI_WARN << "Disagreement in forward/backward log-probs: " << tot_log_prob_backward << " vs. " << tot_log_prob_; + if (numerator_post) { + std::vector nnet_output_indexes_cpu; + nnet_output_indexes_.CopyToVec(&nnet_output_indexes_cpu); + ConvertDerivsToPosterior(nnet_logprob_derivs_, nnet_output_indexes_cpu, + nnet_output_.NumRows(), numerator_post); + } + // copy this data to GPU. CuVector nnet_logprob_deriv_cuda; nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 15cb31e0571..63cb186fde8 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -32,6 +32,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" @@ -78,7 +79,8 @@ class NumeratorComputation { // Does the backward computation and (efficiently) adds the derivative of the // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. - void Backward(CuMatrixBase *nnet_output_deriv); + void Backward(CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post = NULL); private: @@ -143,4 +145,3 @@ class NumeratorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_NUMERATOR_H_ - diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 6ac96e85593..c4637c9cb86 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -27,20 +27,6 @@ namespace kaldi { namespace chain { -/* This function converts the pdf occupation probabilties (e.g. computed - using Forward-Backward on the numerator graph) to posteriors. 
- */ -void ConvertDerivsToPosterior(const CuMatrixBase &numerator_derivs, - Posterior *numerator_post) { - numerator_post->resize(numerator_derivs.NumRows()); - for (size_t i = 0; i < numerator_derivs.NumRows(); ++i) { - const auto &row = numerator_derivs.Row(i); - for (size_t pdfid = 0; pdfid < row.Dim(); ++pdfid) - if (row(pdfid) != 0.0) - (*numerator_post)[i].push_back(std::make_pair(pdfid, row(pdfid))); - } -} - void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -91,12 +77,14 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, // the numerator object, as well as the returned logprob. if (xent_output_deriv) { numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, - xent_output_deriv); + xent_output_deriv, + numerator_post); if (numerator_ok && nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, - nnet_output_deriv); + nnet_output_deriv, + numerator_post); } else { num_logprob_weighted = numerator.ComputeObjf(); } @@ -106,11 +94,6 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, numerator_ok = numerator_ok && (num_logprob_weighted - num_logprob_weighted == 0); - if (numerator_post && (xent_output_deriv || nnet_output_deriv)) { - ConvertDerivsToPosterior(nnet_output_deriv ? *nnet_output_deriv : - *xent_output_deriv, numerator_post); - } - *objf = num_logprob_weighted - den_logprob_weighted; if (!((*objf) - (*objf) == 0) || !denominator_ok || !numerator_ok) { // inf or NaN detected, or denominator computation returned false. @@ -224,19 +207,14 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, num_logprob_weighted = numerator.Forward(); if (xent_output_deriv) { - numerator.Backward(xent_output_deriv); + numerator.Backward(xent_output_deriv, numerator_post); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); + numerator.Backward(nnet_output_deriv, numerator_post); } } - if (numerator_post && (xent_output_deriv || nnet_output_deriv)) { - ConvertDerivsToPosterior(nnet_output_deriv ? *nnet_output_deriv : - *xent_output_deriv, numerator_post); - } - *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; From d23dfe2964bd30631a66760636c0215158091012 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 1 Jan 2019 16:26:06 -0500 Subject: [PATCH 49/87] Add context extension capability to nnet3-chain-copy-egs with merged-eg support; truncate still only supports single egs. --- src/chainbin/nnet3-chain-copy-egs.cc | 217 +++++++++++++++++---------- 1 file changed, 138 insertions(+), 79 deletions(-) diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..abf039279f4 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -91,7 +91,7 @@ void FilterExample(int32 min_input_t, if (io.name == "input") { min_t = min_input_t; max_t = max_input_t; - + const std::vector &indexes_in = io.indexes; std::vector indexes_out; indexes_out.reserve(indexes_in.size()); @@ -124,22 +124,88 @@ void FilterExample(int32 min_input_t, } } +/** + This function extends the left/right input context by adding + necessary indexes (and feature rows) for the NnetIo named "input". 
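+   Here 'n_stride' is the number of sequences (distinct 'n' values) in the
+   example, and [min_input_t, max_input_t] is the desired input 't' range
+   after extension.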
+ First/last frame will be duplicated to add left/right context respectively. + */ +void ExtendContext(NnetChainExample *eg, + int32 n_stride, + int32 min_input_t, + int32 max_input_t, + int32 extend_left_context, + int32 extend_right_context) { + // process the inputs + for (size_t i = 0; i < eg->inputs.size(); i++) { + NnetIo &io = eg->inputs[i]; + if (io.name == "input") { + // Assume t_stride = 1 (since it's input) + std::vector &indexes = io.indexes; + KALDI_ASSERT(indexes.size() < 2 || indexes[0].t + 1 == indexes[1].t); + // The input indexes are not re-ordered. The order is: all frames of first + // sequence, then all frames of 2nd seq, ... + indexes.resize(indexes.size() + n_stride * (extend_left_context + + extend_right_context)); + KALDI_ASSERT(indexes.size() == n_stride * + (max_input_t - min_input_t + 1)); + + for (int32 n = 0, i = 0; n < n_stride; ++n) { + for (int32 t = min_input_t; t <= max_input_t; ++t, ++i) { + indexes[i].t = t; + indexes[i].n = n; + } + } + + Matrix features_out(indexes.size(), io.features.NumCols()); + Matrix features_in; + io.features.GetMatrix(&features_in); -/** Returns true if the "eg" contains just a single example, meaning - that all the "n" values in the indexes are zero, and the example - has NnetIo members named both "input" and "output" + int32 original_min_t = min_input_t + extend_left_context, + original_max_t = max_input_t - extend_right_context; + // For each "n", duplicate the first frame to extend left context, + // then copy the features, then duplicate the last frame to extend right + // context. + int32 i_in = 0, i_out = 0; + for (int32 n = 0; n < n_stride; ++n) { + // Duplicate frame i_in, "extend_left_context" times + for (int32 j = 0; j < extend_left_context; ++j, ++i_out) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in)); + + for (int32 t = original_min_t; t <= original_max_t; ++t, ++i_out, ++i_in) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in)); + + // Duplicate frame i_in - 1, "extend_right_context" times + for (int32 j = 0; j < extend_right_context; ++j, ++i_out) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in - 1)); + + } + KALDI_ASSERT(i_in == features_in.NumRows()); + KALDI_ASSERT(i_out == features_out.NumRows()); + + GeneralMatrix features_out_gmat; + features_out_gmat.SwapFullMatrix(&features_out); + io.features = features_out_gmat; + } + } +} + +/** Counts the number of single examples in "eg", which is equal to + the maximum "n" value in the indexes plus 1. + If the example does not have both "input" and "output" NnetIo members, + this function will exit the program with an error. Also computes the minimum and maximum "t" values in the "input" and "output" NnetIo members. 
*/ -bool ContainsSingleExample(const NnetChainExample &eg, - int32 *min_input_t, - int32 *max_input_t, - int32 *min_output_t, - int32 *max_output_t) { +static int32 CountSingleExamples(const NnetChainExample &eg, + int32 *min_input_t, + int32 *max_input_t, + int32 *min_output_t, + int32 *max_output_t) { bool done_input = false, done_output = false; int32 num_indexes_input = eg.inputs.size(); int32 num_indexes_output = eg.outputs.size(); + int32 max_n = 0; for (int32 i = 0; i < num_indexes_input; i++) { const NnetIo &input = eg.inputs[i]; std::vector::const_iterator iter = input.indexes.begin(), @@ -152,23 +218,12 @@ bool ContainsSingleExample(const NnetChainExample &eg, int32 this_t = iter->t; min_t = std::min(min_t, this_t); max_t = std::max(max_t, this_t); - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } + if (iter->n > max_n) + max_n = iter->n; } done_input = true; *min_input_t = min_t; *max_input_t = max_t; - } else { - for (; iter != end; ++iter) { - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } - } } } @@ -184,34 +239,22 @@ bool ContainsSingleExample(const NnetChainExample &eg, int32 this_t = iter->t; min_t = std::min(min_t, this_t); max_t = std::max(max_t, this_t); - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } + // max_n must be the same for all io's (either input or output). + KALDI_ASSERT(iter->n <= max_n + && "Mismatched 'n' values. Partially merged?"); } done_output = true; *min_output_t = min_t; *max_output_t = max_t; - } else { - for (; iter != end; ++iter) { - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } - } } } - if (!done_input) { - KALDI_WARN << "Example does not have any input named 'input'"; - return false; - } - if (!done_output) { - KALDI_WARN << "Example does not have any output named 'output'"; - return false; - } - return true; + if (!done_input) + KALDI_ERR << "Example does not have any input named 'input'"; + + if (!done_output) + KALDI_ERR << "Example does not have any output named 'output'"; + + return max_n + 1; } // calculate the frame_subsampling_factor @@ -224,41 +267,33 @@ void CalculateFrameSubsamplingFactor(const NnetChainExample &eg, void ModifyChainExampleContext(int32 left_context, int32 right_context, const int32 frame_subsampling_factor, - NnetChainExample *eg) { - static bool warned_left = false, warned_right = false; + NnetChainExample *eg, + int32 *left_context_extension, + int32 *right_context_extension) { int32 min_input_t, max_input_t, - min_output_t, max_output_t; - if (!ContainsSingleExample(*eg, &min_input_t, &max_input_t, - &min_output_t, &max_output_t)) - KALDI_ERR << "Too late to perform frame selection/context reduction on " - << "these examples (already merged?)"; - if (left_context != -1) { + min_output_t, max_output_t; + *left_context_extension = 0; + *right_context_extension = 0; + // Example stride really means "n" stride (of the NnetIo's) + int32 example_stride = CountSingleExamples(*eg, &min_input_t, &max_input_t, + &min_output_t, &max_output_t); + if (left_context >= 0) { int32 observed_left_context = min_output_t - min_input_t; - if (!warned_left && 
observed_left_context < left_context) { - warned_left = true; - KALDI_WARN << "You requested --left-context=" << left_context - << ", but example only has left-context of " - << observed_left_context - << " (will warn only once; this may be harmless if " - "using any --*left-context-initial options)"; - } - min_input_t = std::max(min_input_t, min_output_t - left_context); + if (left_context > observed_left_context) // Extend + *left_context_extension = left_context - observed_left_context; + // Adjust min input t + min_input_t = min_output_t - left_context; } - if (right_context != -1) { + if (right_context >= 0) { int32 observed_right_context = max_input_t - max_output_t; - - if (right_context != -1) { - if (!warned_right && observed_right_context < right_context) { - warned_right = true; - KALDI_WARN << "You requested --right-context=" << right_context - << ", but example only has right-context of " - << observed_right_context - << " (will warn only once; this may be harmless if " - "using any --*right-context-final options."; - } - max_input_t = std::min(max_input_t, max_output_t + right_context); - } + if (right_context > observed_right_context) // Extend + *right_context_extension = right_context - observed_right_context; + max_input_t = max_output_t + right_context; } + + if (*left_context_extension > 0 || *right_context_extension > 0) + ExtendContext(eg, example_stride, min_input_t, max_input_t, + *left_context_extension, *right_context_extension); FilterExample(min_input_t, max_input_t, min_output_t, max_output_t, eg); @@ -348,6 +383,8 @@ int main(int argc, char *argv[]) { exclude_names.push_back(std::string("ivector")); int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_left_context_extensions = 0, num_right_context_extensions = 0, + total_left_context_extension = 0, total_right_context_extension = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -367,7 +404,7 @@ int main(int argc, char *argv[]) { weight = egs_weight_reader.Value(key); ScaleSupervisionWeight(weight, &eg); } - + if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; @@ -377,13 +414,25 @@ int main(int argc, char *argv[]) { std::string new_output_name = output_name_reader.Value(key); RenameOutputs(new_output_name, &eg); } - + if (frame_shift != 0) ShiftChainExampleTimes(frame_shift, exclude_names, &eg); - if (left_context != -1 || right_context != -1) + if (left_context != -1 || right_context != -1) { + int32 right_context_extension, left_context_extension; ModifyChainExampleContext(left_context, right_context, - frame_subsampling_factor, &eg); - + frame_subsampling_factor, &eg, + &left_context_extension, + &right_context_extension); + if (left_context_extension > 0) { + num_left_context_extensions++; + total_left_context_extension += left_context_extension; + } + if (right_context_extension > 0) { + num_right_context_extensions++; + total_right_context_extension += right_context_extension; + } + } + for (int32 c = 0; c < count; c++) { int32 index = (random ? 
Rand() : num_written) % num_outputs; example_writers[index]->Write(key, eg); @@ -394,6 +443,16 @@ int main(int argc, char *argv[]) { delete example_writers[i]; KALDI_LOG << "Read " << num_read << " neural-network training examples, wrote " << num_written; + if (num_left_context_extensions > 0) + KALDI_LOG << "Left context was extended for " + << num_left_context_extensions << " examples, by an average of " + << (1.0 * total_left_context_extension / + num_left_context_extensions) << " frames"; + if (num_right_context_extensions > 0) + KALDI_LOG << "Right context was extended for " + << num_right_context_extensions << " examples, by an average of " + << (1.0 * total_right_context_extension + / num_right_context_extensions) << " frames."; return (num_written == 0 ? 1 : 0); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; From e3db5191e4fcc5c5f4b5c0265907331660a0fd8b Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 2 Jan 2019 16:21:43 -0500 Subject: [PATCH 50/87] Support merged-egs when removing context --- src/chainbin/nnet3-chain-copy-egs.cc | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index abf039279f4..60fb70bd1c7 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -264,6 +264,16 @@ void CalculateFrameSubsamplingFactor(const NnetChainExample &eg, - eg.outputs[0].indexes[0].t; } +/* This function adds or removes context for the examples inside + "eg" (which can contain just a single example or it can be a + merged-eg which would contain more than one example). Addition or + removal of context is determined by comparing "left_context" with + the observed left context of "eg" (the same goes for right context): + if it's more, it'll extend input context by duplicating the first (or last, + for right context) frame. Otherwise, it'll remove the extra context from + both inputs and outputs in "eg". Note that when extending context, only the + "input" io will be modified (the "output" io will remain the same). + */ void ModifyChainExampleContext(int32 left_context, int32 right_context, const int32 frame_subsampling_factor, @@ -291,12 +301,12 @@ void ModifyChainExampleContext(int32 left_context, max_input_t = max_output_t + right_context; } - if (*left_context_extension > 0 || *right_context_extension > 0) - ExtendContext(eg, example_stride, min_input_t, max_input_t, - *left_context_extension, *right_context_extension); FilterExample(min_input_t, max_input_t, min_output_t, max_output_t, eg); + if (*left_context_extension > 0 || *right_context_extension > 0) + ExtendContext(eg, example_stride, min_input_t, max_input_t, + *left_context_extension, *right_context_extension); } // ModifyChainExampleContext } // namespace nnet3 From 04e6dc94737b0e20eb3b31a12210ffc3b7b625d4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 30 Dec 2018 13:43:43 -0800 Subject: [PATCH 51/87] Incorporate code for keeping track of the tree map. 
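
The "tree map" ('pdf_map', typically produced by build-tree-two-level) maps
each tree leaf (pdf-id) to one of a smaller number of clusters, so that the
per-class statistics needed by the adaptation transform can be estimated
reliably from a single minibatch.  The following is a minimal sketch of how
such a map relabels a posterior from pdf-ids to cluster-ids; the helper name
is invented for illustration only (the logic actually added in this patch
lives in ConvertPosteriorElement() in nnet-chaina-training.cc, and the map
itself is stored in DifferentiableTransformMapped::pdf_map), and it assumes
the usual kaldi::Posterior type from hmm/posterior.h.

    // Relabel a Posterior in place from pdf-ids to cluster-ids.
    // An empty pdf_map means "no mapping": labels are left unchanged.
    static void MapPosteriorToClusters(const std::vector<int32> &pdf_map,
                                       Posterior *post) {
      if (pdf_map.empty()) return;
      for (size_t t = 0; t < post->size(); t++) {
        for (size_t i = 0; i < (*post)[t].size(); i++) {
          int32 pdf_id = (*post)[t][i].first;
          KALDI_ASSERT(static_cast<size_t>(pdf_id) < pdf_map.size());
          // The resulting cluster-id is in [0, num-classes - 1], where
          // num-classes is transform->NumClasses().
          (*post)[t][i].first = pdf_map[pdf_id];
        }
      }
    }

Unlike this in-place sketch, the code below maps while copying (the posterior
at the output of the top model is never modified), and when pdf_map is empty
it only spot-checks that the pdf-ids are in range.
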
--- src/adapt/differentiable-transform-itf.cc | 29 +++++++++ src/adapt/differentiable-transform-itf.h | 37 ++++++++++- src/nnet3a/nnet-chaina-training.cc | 74 ++++++++++++++++------ src/nnet3a/nnet-chaina-training.h | 36 +++++++---- src/nnet3a/notes.update | 77 +++++++++++++++++++++++ 5 files changed, 222 insertions(+), 31 deletions(-) diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc index aafb9abe86f..a8fb3cbed61 100644 --- a/src/adapt/differentiable-transform-itf.cc +++ b/src/adapt/differentiable-transform-itf.cc @@ -136,5 +136,34 @@ DifferentiableTransform* DifferentiableTransform::ReadFromConfig( return transform; } +int32 DifferentiableTransformMapped::NumPdfs() const { + if (pdf_map.empty()) + return transform->NumClasses(); + else + return static_cast(pdf_map.size()); +} + +void DifferentiableTransformMapped::Read(std::istream &is, bool binary) { + if (transform) + delete transform; + transform = DifferentiableTransform::ReadNew(is, binary); + ReadIntegerVector(is, binary, &pdf_map); + Check(); +} + +void DifferentiableTransformMapped::Write(std::ostream &os, bool binary) const { + Check(); + transform->Write(os, binary); + WriteIntegerVector(os, binary, pdf_map); +} + + +void DifferentiableTransformMapped::Check() const { + KALDI_ASSERT(transform != NULL && + (pdf_map.empty() || + *std::max_element(pdf_map.begin(), pdf_map.end()) + 1 == + transform->NumClasses())); +} + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index dc68471ae0f..8a8c4b5bd78 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -183,7 +183,7 @@ class DifferentiableTransform { same dimensions as 'input'. It does not have to be free of NaNs when you call this function. @return This function returns either NULL or an object of type - DifferentiableTransformItf*, which is expected to later be given + DifferentiableTransform*, which is expected to later be given to the function TrainingBackward(). It will store any information that needs to be remembered for the backward phase. @@ -372,6 +372,41 @@ class DifferentiableTransform { }; +/** + struct DifferentiableTransformMapped is just a holder of an object of type + DifferentiableTransform and a vector representing a map from + pdf-ids to classes. + + This map (if present) will be obtained from the binary build-tree-two-level, + and will map from tree leaves to a smaller number of classes (e.g. 200), so + that we can reasonably estimate the class means from a single minibatch + during training. The contents of 'pdf_map' should be in the range [0, + transform->NumClases() - 1]. + + */ +struct DifferentiableTransformMapped { + DifferentiableTransform *transform; + std::vector pdf_map; + + // This function returns pdf_map.size() if pdf_map is nonempty; otherwise + // it returns transform->NumClasses(). + int32 NumPdfs() const; + + void Read(std::istream &is, bool binary); + + void Write(std::ostream &os, bool binary) const; + + // Check that the dimensions are consistent, i.e. pdf_map.empty() or + // transform->NumClasses() == max-element-in-pdf_map + 1. 
+ void Check() const; + + DifferentiableTransformMapped(): transform(NULL) {} + + ~DifferentiableTransformMapped() { delete transform; } + +}; + + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index ec04a05b9cb..369b28e6cf8 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -104,12 +104,7 @@ NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( } } ReadFstKaldi(den_fst_filename, &(info->den_fst)); - { - bool binary; - Input ki(transform_filename, &binary); - info->transform = differentiable_transform::DifferentiableTransform::ReadNew( - ki.Stream(), binary); - } + ReadKaldiObject(transform_filename, &(info->transform)); lang_info_[lang] = info; return info; } @@ -144,11 +139,11 @@ Nnet* NnetChainaModels::GetRawNnetForLang( return &(info->am_nnet.GetNnet()); } -differentiable_transform::DifferentiableTransform* +differentiable_transform::DifferentiableTransformMapped* NnetChainaModels::GetTransformForLang( const std::string &language_name) { LanguageInfo *info = GetInfoForLang(language_name); - return info->transform; + return &(info->transform); } @@ -190,7 +185,7 @@ NnetChainaTopTrainer::NnetChainaTopTrainer( const std::string &lang_name, const NnetChainaTrainingOptions &config, const fst::StdVectorFst &den_fst, - const differentiable_transform::DifferentiableTransform &transform, + const differentiable_transform::DifferentiableTransformMapped &transform, Nnet *nnet): lang_name_(lang_name), opts_(config), @@ -594,7 +589,10 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, Posterior post_padded(input.NumRows()); ConvertPosterior(post, num_sequences, first_input_t, - top_subsampling_factor, &post_padded); + top_subsampling_factor, + transform_.pdf_map, + transform_.transform->NumClasses(), + &post_padded); structure.adapted = true; std::shared_ptr computation_adapted = @@ -605,7 +603,7 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, adapted_input_deriv; using namespace differentiable_transform; - MinibatchInfoItf *minibatch_info = transform_.TrainingForward( + MinibatchInfoItf *minibatch_info = transform_.transform->TrainingForward( input, num_sequences, num_spk, post_padded, &adapted_input); success = TrainAdapted( @@ -620,18 +618,53 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, if (input_deriv == NULL) delete minibatch_info; else - transform_.TrainingBackward(input, adapted_input_deriv, - num_sequences, num_spk, post_padded, - minibatch_info, input_deriv); + transform_.transform->TrainingBackward(input, adapted_input_deriv, + num_sequences, num_spk, post_padded, + minibatch_info, input_deriv); return true; } +/** + This helper function for ConvertPosterior() converts from pdf-ids to + cluster-ids using the map provided in pdf_map, if it is nonempty. + If pdf_map is empty, it just copies the pairs over unchanged. + */ +static inline void ConvertPosteriorElement( + const std::vector &pdf_map, + int32 num_classes, + const std::vector > &post_elem_in, + std::vector > *post_elem_out) { + if (pdf_map.empty()) { + *post_elem_out = post_elem_in; + if (!post_elem_in.empty()) { + // We just check the first int32-- this is a spot-check that the + // pdf-ids are in the correct range. 
+ KALDI_ASSERT(post_elem_in[0].first < num_classes); + } + } else { + int32 num_classes_in = pdf_map.size(); + size_t num_pairs = post_elem_in.size(); + post_elem_out->resize(num_pairs); + for (size_t i =0; i < num_pairs; i++) { + int32 pdf_id = post_elem_in[i].first; + BaseFloat weight = post_elem_in[i].second; + KALDI_ASSERT(pdf_id < num_classes_in); + int32 cluster_id = pdf_map[pdf_id]; + KALDI_ASSERT(cluster_id < num_classes); + (*post_elem_out)[i].first = cluster_id; + (*post_elem_out)[i].second = weight; + } + } +} + void NnetChainaTopTrainer::ConvertPosterior( const Posterior &post_at_output, int32 num_sequences, int32 first_input_t, int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, Posterior *post_at_input) { int32 output_post_size = post_at_output.size(), input_post_size = post_at_input->size(), @@ -657,11 +690,13 @@ void NnetChainaTopTrainer::ConvertPosterior( for (int32 n = 0; n < num_sequences; n++) { int32 input_index = num_sequences * (t_in - first_input_t) + n, output_index = num_sequences * ((t_out - first_output_t) / s) + n; - (*post_at_input)[input_index] = post_at_output[output_index]; + ConvertPosteriorElement(pdf_map, num_classes, + post_at_output[output_index], + &((*post_at_input)[input_index])); } } - // else just leave the posterior for this frame empty. This will happen for - // most of the frames that were added for left and right context. + // else just leave the input posterior for this frame empty. This will + // happen for most of the frames that were added for left and right context. } } @@ -865,7 +900,10 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( return computation; } - +void NnetChainaBottomTrainer::PrintTotalStats() const { + KALDI_LOG << "Max-change stats for bottom nnet:"; + max_change_stats_.Print(*nnet_); +} NnetChainaBottomTrainer::~NnetChainaBottomTrainer() { delete delta_nnet_; } diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index 9ec126a167a..a5b041e2848 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -200,7 +200,7 @@ class NnetChainaModels { // AmNnetSimple object returned by 'GetNnetForLang'. Nnet *GetRawNnetForLang(const std::string &language_name); - differentiable_transform::DifferentiableTransform *GetTransformForLang( + differentiable_transform::DifferentiableTransformMapped *GetTransformForLang( const std::string &language_name); // Writes the files @@ -237,11 +237,7 @@ class NnetChainaModels { // den_fst comes from /.fst fst::StdVectorFst den_fst; // transform comes from /.ada - differentiable_transform::DifferentiableTransform *transform; - LanguageInfo(): transform(NULL) { } - ~LanguageInfo() { - delete transform; - } + differentiable_transform::DifferentiableTransformMapped transform; }; @@ -260,7 +256,7 @@ class NnetChainaModels { std::string model_dir_; // Directory where denominator FSTs are located. std::string den_fst_dir_; - // Directory where transforms (type: DifferentiableTransform) are located. + // Directory where transforms (type: DifferentiableTransformMapped) are located. std::string transform_dir_; // This corresponds to /bottom.raw. @@ -302,7 +298,7 @@ class NnetChainaTopTrainer { const std::string &lang_name, const NnetChainaTrainingOptions &config, const fst::StdVectorFst &den_fst, - const differentiable_transform::DifferentiableTransform &transform, + const differentiable_transform::DifferentiableTransformMapped &transform, Nnet *nnet); /** Train on one minibatch. 
@@ -530,6 +526,21 @@ class NnetChainaTopTrainer { at the input are assumed to be consecutive. @param [in] top_subsampling_factor The number of frames with which 't' values at the output are separated. + @param [in] pdf_map This is either the empty vector (meaning: + the DifferentiableTransform object deals with pdf-ids + directly), or it is a map from pdf-ids to cluster-ids. + This would actually be obtained from build-tree-two-level + after building a two-level tree, and it would be stored + in the .ada object. The actual class labels that + the DifferentiableTransform object deals with, will + be the values stored in 'pfd_map' (i.e. these cluster-ids). + @param [in] num_classes Provided for checking purposes only: the + number of classes that the DifferentiableTransform object + expects. If pdf_map is empty we expect this to be the + same as the number of pdf-ids (and the ints in + post_at_output to be in the range [0, num_classes - 1]). + If pdf_map is nonempty, we expect this to be the same + as the maximum element in pdf_map, plus one. @param [out] post_at_input The posterior after padding and possibly subsampling. Should have the correct size but its elements are expected to be empty at entry. Like @@ -541,6 +552,8 @@ class NnetChainaTopTrainer { int32 num_sequences, int32 first_input_t, int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, Posterior *post_at_input); /** @@ -590,7 +603,7 @@ class NnetChainaTopTrainer { const NnetChainaTrainingOptions &opts_; chain::DenominatorGraph den_graph_; - const differentiable_transform::DifferentiableTransform &transform_; + const differentiable_transform::DifferentiableTransformMapped &transform_; CachingOptimizingCompiler compiler_; @@ -703,9 +716,8 @@ class NnetChainaBottomTrainer { NnetComputer *computer, CuMatrix *output_deriv); - - // Prints out the final stats, and return true if there was a nonzero count. - bool PrintTotalStats() const; + // Prints the max-change stats for the bottom nnet. + void PrintTotalStats() const; // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do // this after the first minibatch of training, to reduce fragmentation. diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update index 1f0f9820b2a..d3009a1fdca 100644 --- a/src/nnet3a/notes.update +++ b/src/nnet3a/notes.update @@ -1,3 +1,80 @@ +=== + + Meta-info for dumping egs: + only really need tree,trans_mdl,normalization.fst,den.fst + + +=== + + +Things needed per language in order to dump raw egs: + + Configuration values: + - left and right acoustic context + - frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, + alignment_subsampling_factor, constrained, compress, left_tolerance, + right_tolerance, lattice_lm_scale, lattice_prune_beam, acwt + + (Also: left_context_initial,right_context_initial... although + the use of these will make it harder to deal with setups with + little data per speaker). + + - tree, tree.map, 0.trans_mdl, normalization.fst (and probably den.fst + so we can save it in the egs dir). + + - Format of raw egs dir (we'll likely delete this right after creation): + + info.txt: + dir_type raw_chaina_egs + num_clusters 200 + num_chunks 120000 + num_leaves 6543 + frames_per_chunk 140,110,100 + # number of input frames, not including context + num_input_frames_tot 432143218 + left_context 10 + left_context_initial 10 + right_context 10 + right_context_initial 10 + ... various configuration values here... 
+ + # note: tree.map is optional, since the egs don't depend on it, but it will generally + # be generated with the tree. + misc/{tree,tree.map,0.trans_mdl,normalization.fst,den.fst} + + egs.scp will contain encodings like: + -----v1 + + - Format of merged-egs dir + + dir_type merged_chaina_egs + chunks_per_spk 4 + .. otherwise like raw one. misc/ directory contains similar things. + + - Format of final-egs dir (might be merged). + info.txt: + dir_type final_chain_egs + langs english french + num_input_frames_tot 432143218 + num_scp_files 24 + frames_per_scp_file 143241 + chunks_per_spk 4 + num_chunks xxxx + + den_fsts + + +================= + + + + - contains egs.N.{scp,ark}, which might be links to files in the storage dir. + + egs.scp + +The + + From 97d6a755073caaec7f527d7e0dadbaf64a71fbe6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 3 Jan 2019 20:44:00 -0800 Subject: [PATCH 52/87] [src,scripts,egs] Further progress --- egs/mini_librispeech/s5/conf/mfcc_hires2.conf | 14 + .../s5/local/chaina/data_prep_common.sh | 76 +++ .../s5/local/chaina/tuning/run_tdnn_1a.sh | 438 ++++++++++++++++++ egs/wsj/s5/steps/chaina/get_raw_egs.sh | 258 +++++++++++ .../chaina/internal/choose_egs_to_merge.py | 141 ++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + egs/wsj/s5/steps/nnet3/chain/build_tree.sh | 35 +- egs/wsj/s5/steps/nnet3/xconfig_to_config.py | 104 +++++ egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 46 +- src/adapt/differentiable-transform-itf.cc | 9 + src/adapt/differentiable-transform-itf.h | 10 + src/chainbin/nnet3-chain-get-egs.cc | 6 +- src/nnet3/nnet-example-utils.cc | 3 +- src/nnet3a/nnet-chaina-training.cc | 2 +- src/nnet3a/nnet-chaina-training.h | 6 +- src/nnet3a/notes.update | 36 +- tools/config/common_path.sh | 1 + 17 files changed, 1166 insertions(+), 20 deletions(-) create mode 100644 egs/mini_librispeech/s5/conf/mfcc_hires2.conf create mode 100755 egs/mini_librispeech/s5/local/chaina/data_prep_common.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh create mode 100755 egs/wsj/s5/steps/chaina/get_raw_egs.sh create mode 100755 egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py create mode 100755 egs/wsj/s5/steps/nnet3/xconfig_to_config.py diff --git a/egs/mini_librispeech/s5/conf/mfcc_hires2.conf b/egs/mini_librispeech/s5/conf/mfcc_hires2.conf new file mode 100644 index 00000000000..2e8dc221d40 --- /dev/null +++ b/egs/mini_librispeech/s5/conf/mfcc_hires2.conf @@ -0,0 +1,14 @@ +# config for high-resolution MFCC features, intended for 'chaina' neural network +# training. These '..2.conf' setups are intended to have the --modified=true +# configuration value. + +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +# Will soon add: --modified=true +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh new file mode 100755 index 00000000000..087756a9ea0 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/chaina/run_tdnn.sh and +# similar scripts. It contains the common feature preparation and +# lattice-alignment preparation parts of the chaina training. +# See those scripts for examples of usage. + +stage=0 +train_set=train_clean_5 +test_sets="dev_clean_2" +gmm=tri3b + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +# Our default data augmentation method is 3-way speed augmentation followed by +# volume perturbation. We are looking into better ways of doing this, +# e.g. involving noise and reverberation. + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment. _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires2/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires2 || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires2.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + + +exit 0 diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..0ed32ea20aa --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -0,0 +1,438 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 + + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_spk=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
+mkdir -p $dir/configs/ +# $dir/0 will contain the models for iteration zero. +mkdir -p $dir/0/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=256 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Splice(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn dim=256 + output name=output input=linear_bottleneck +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/0/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain input=prefinal-si-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent input=prefinal-si-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config 0 - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/0/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/0/info.txt < $dir/0/info.txt +fi + + +if [ $stage -le 17 ]; then + # Make phone LM and denominator and normalization FST + mkdir -p $dir/den_fsts/log + + # We may later reorganize this. + cp $tree_dir/tree $dir/default.tree + + echo "$0: creating phone language-model" + $cmd $dir/den_fsts/log/make_phone_lm_default.log \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + "ark:gunzip -c $gmm_dir/ali.*.gz | ali-to-phones $gmm_dir/final.mdl ark:- ark:- |" \ + $dir/den_fsts/default.phone_lm.fst + + echo "$0: creating denominator FST" + $cmd $dir/den_fsts/log/make_den_fst.log \ + chain-make-den-fst $dir/default.tree $dir/0/default.mdl $dir/den_fsts/default.phone_lm.fst \ + $dir/den_fsts/default.den.fst $dir/den_fsts/default.normalization.fst || exit 1; +fi + + +model_left_context=$(awk '/^model_left_context/ {print $2;}' $dir/0/info.txt) +model_right_context=$(awk '/^model_right_context/ {print $2;}' $dir/0/info.txt) +egs_left_context=$[[model_left_context+egs_extra_left_context]] +egs_right_context=$[[model_right_context+egs_extra_right_context]] + + +if [ $stage -le 18 ]; then + echo "$0: about to dump raw egs." + # Dump raw egs. + steps/chaina/get_raw_egs.sh \ + --lang "default" \ + --left-context $egs_left_context \ + --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-egs 150 \ + ${train_data_dir} ${dir} ${lat_dir} ${dir}/raw_egs +fi + +if [ $stage -le 19 ]; then + echo "$0: about to process egs" + steps/chaina/process_egs.sh \ + --chunks-per-spk ${chunks_per_spk} ${dir}/raw_egs ${dir}/processed_egs +fi + + + for d in $dir/raw_egs $dir/merged_egs; do + mkdir -p $d + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $d/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$d/storage $d/storage + fi + done + + + mkdir -p $dir/raw_egs + steps/chaina/get_raw_egs.sh --lang default \ + ${train_data_dir} $dir exp/tri3_lats $dir/raw_egs +fi + + # Work out the model + # The following script is equivalent to doing something like the + # following: + # cat > $dir/0/info.txt < $dir/0/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l =0, right-context for last chunk of an utterance. +right_context_final=-1 # if >=0, right-context for last chunk of an utterance. + +compress=true # set this to false to disable compression (e.g. if you want to + # see whether results are affected). Note: if the features on + # disk were originally compressed, nnet3-chain-get-egs will dump + # compressed features regardless (since there is no further loss + # in that case). + +lang=default # the language name. will usually be 'default' in single-language + # setups. Requires because it's part of the name of some of + # the input files. + +right_tolerance= # chain right tolerance == max label delay. Only relevant if + # constrained=true. At frame rate of alignments. Code + # default is 5. +left_tolerance= # chain left tolerance (versus alignments from lattices). + # Only relevant if constrained=true. At frame rate of + # alignments. Code default is 5. + +stage=0 +max_jobs_run=40 # This should be set to the maximum number of + # nnet3-chain-get-egs jobs you are comfortable to run in + # parallel; you can increase it if your disk speed is + # greater and you have more machines. + + +srand=0 # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs + +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. 
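To make the effect of --lattice-lm-scale more concrete, here is a small, illustrative Python sketch (not part of the patch; the function name and structure are invented) of the relationship the script enforces further down between this option and the scale later applied to the normalization FST:

    def supervision_scales(lattice_lm_scale=0.0):
        # The lattice graph/LM scores enter the supervision with weight
        # lattice_lm_scale, and the normalization FST is scaled by the
        # remainder, so the two weights roughly sum to 1.0.
        if lattice_lm_scale < 0.0 or lattice_lm_scale >= 1.0:
            raise ValueError("Invalid --lattice-lm-scale %g" % lattice_lm_scale)
        return lattice_lm_scale, 1.0 - lattice_lm_scale

For conventional supervised data the default of 0 leaves the normalization FST at full scale; for the unsupervised part of semi-supervised data a value around 0.5 splits the weight between the lattice scores and the normalization FST.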
+lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. + +acwt=0.1 # For pruning. Should be, for instance, 1.0 for chain lattices. +deriv_weights_scp= + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/chaina/tdnn1a_sp exp/tri3_lats exp/chaina/tdnn1a_sp/raw_egs" + echo "" + echo "From , 0/.mdl (for the transition-model), .tree (the tree), " + echo " den_fsts/.den.fst, and den_fsts/.normalization.fst (the normalization " + echo " FST, derived from the denominator FST echo are read (where is specified" + echo " by the --lang option (its default values is 'default')" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --max-jobs-run # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --lang # Name of the language, determines names of some inputs." + echo " --frames-per-chunk # number of supervised frames per chunk on disk" + echo " # ... may be a comma separated list, but we advise a single" + echo " # number in most cases, due to interaction with the need " + echo " # to group egs from the same speaker into groups." + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # Left-context for first chunk of an utterance" + echo " --right-context-final # Right-context for last chunk of an utterance" + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- should be acoustic scale at which the " + echo " # supervision lattices are to be interpreted. Affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " # (e.g., might be relevant for unsupervised training)." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +tree=$chaindir/${lang}.tree +trans_mdl=$chaindir/0/${lang}.mdl # contains the transition model and a nnet. +normalization_fst=$chaindir/0/${lang}.normalization.fst +den_fst=$chaindir/0/${lang}.den.fst + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $tree $trans_mdl $normalization_fst $den_fst; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=$(cat $latdir/num_jobs) || exit 1 +if [ -f $latdir/per_utt ]; then + sdata=$data/split${nj}utt + utils/split_data.sh --per-utt $data $nj +else + sdata=$data/split$nj + utils/split_data.sh $data $nj +fi + +mkdir -p $dir/log $dir/misc + +cp $tree $dir/misc/ +copy-transition-model $trans_mdl $dir/misc/${lang}.trans_mdl +cp $normalization_fst $den_fst $dir/misc/ +cp data/utt2spk $dir/misc/ +if [ -f $data/utt2uniq ]; then + cp $data/utt2uniq $dir/misc/ +elif [ -f $dir/misc/utt2uniq ]; then + rm $dir/misc/utt2uniq +fi + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $nj); do echo $dir/cegs.$x.ark; done) +fi + + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +egs_opts="--long-key=true --left-context=$left_context --right-context=$right_context --num-frames=$frames_per_chunk --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + + +chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" + +if ! $constrained; then + # e2e supervision + chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false" + egs_opts="$egs_opts --transition-model=$chaindir/0.trans_mdl" +fi + +if [ ! 
-z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + + normalization_fst_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1); + } + print (1.0 - $lattice_lm_scale);") || exit 1 + egs_opts="$egs_opts --normalization-fst-scale=$normalization_fst_scale" +fi + +if [ $stage -le 0 ]; then + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ + "$lats_rspecifier" ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $dir/misc/tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ + "$normalization_fst" $sdata/JOB/feats.scp ark,s,cs:- \ + ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp || exit 1; +fi + + +if [ $stage -le 1 ]; then + frames_and_chunks=$(for n in $(seq nj); do cat $dir/log/get_egs.$n.log; done | \ + perl -e '$nf=0;$nc=0; while() { if(m/with total length (\d+) frames.+ into (\d+) chunks/) { $nf += $1; $nc += $2; } print "$nf $nc";') + num_frames=$(echo $frames_and_chunks || awk '{print $1}') + num_chunks=$(echo $frames_and_chunks || awk '{print $2}') + frames_per_chunk_avg=$[$num_frames/$num_chunks] + feat_dim=$(feat-to-dim scp:$sdata/JOB/feats.scp -) + num_leaves=$(tree-info $chaindir/tree | awk '/^num-pdfs/ {print $2}') + if [ $left_context_initial -lt 0 ]; then + left_context_initial=$left_context + fi + if [ $right_context_final -lt 0 ]; then + right_context_initial=$right_context + fi + + cat >$dir/info.txt < $dir/all.scp +fi + +echo "$0: Finished preparing raw egs" diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py new file mode 100755 index 00000000000..7c0dc11d989 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# License: Apache 2.0. + +import os +import argparse +import sys +import re + + + + +parser = argparse.ArgumentParser(description="Chooses groups of examples to merge into groups " + "of size given by the --chunks-per-spk option, based on speaker " + "information (preferentially, chunks from the same utterance " + "and, if possible, the same speaker, get combined into " + "groups). This script also computes a held-out subset of...", + epilog="E.g. " + sys.argv[0] + "*** TODO *** ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + +# Also maybe have --num-repeats, which must divide --chunks-per-spk? Can be +# used to divide data into different groups than the default ones. + + +parser.add_argument("--chunks-per-spk", type=int, default=4, + help="Number of chunks per speaker in the final egs (actually " + "means the number of chunks per group of chunks, and they are " + "only preferentially taken from the same speaker.") +parser.add_argument("--num-repeats", type=int, default=1, + "The number of times the data is to be repeated. Must divide " + "--chunks-per-spk. Suggest to try only 1 or 2. 
The idea " + "is to divide chunks into groups in different ways, to give " + "more variety to the egs (since the adaptation information " + "will differ.") +parser.add_argument("--heldout-data-selection-proportion", type=float, + default=0.2, + "This parameter governs the selection of the heldout " + "subset and the statistically matched training subset. " + "It does not affect the size of that subset, but only " + "affects what pool the examples are drawb from. " + "Smaller values of this mean that the heldout groups " + "will be preferentially drawn from groups that " + "'contaminate' the least number of other groups, " + "and so require the least data to be removed from the " + "training set. Setting this to 1.0 would mean that " + "the heldout subset is drawn completely at random " + "(which might be more wasteful of training data, but " + "gives a selection that's statistically more " + "representative).") +parser.add_argument("--num-heldout-groups", type=int, default=200, + "Number of utterance groups " + "that will go in the heldout subset (and in the " + "statistically matched training subset)") +parser.add_argument("--utt2uniq", type=str, default='', + "File used in setups with data " + "augmentation, that maps from utterance-ids to the " + "pre-augmentation utterance-id. The reason it's needed " + "is to ensure that the heldout set is properly held " + "out (i.e., that different versions of those utterances " + "weren't trained on. If not specified, we assume the " + "identity map.") +parser.add_argument("--scp-in", type=str, required=True, + "The scp file in, likely containing chain egs. The " + "keys are expected to be of the form: " + "'-----v1', " + "where the left_context, num_frames and right_context are required to be the " + "same in order for keys to be in a group (note: it's best if the " + "--extra-left-context-initial and --extra-right-context-final options " + "are not used, and if the --frames-per-chunk is a single number, in " + "order to prevent this constraint from splitting up the utterances from " + "a single speaker") +parser.add_argument("--training-data-out", type=str, required=True, + "The output file containing the chunks that are to be grouped; each " + "line will contain --chunks-per-spk (e.g. 4) rxfilenames, obtained " + "from the second field of the input --scp-in file.") +parser.add_argument("--heldout-subset-out", type=str, required=True, + "This is the name of the file to which the heldout data subset " + "will be written; the format is the same as --training-data-out.") +parser.add_argument("--training-subset-out", type=str, required=True, + "This is the name of the file to which the statistically matched " + "(to --heldout-subset-out) set of training data will be written") + + + + +""" +Notes on plan for how to implement this (we can keep this as documentation). +This is a rather simple plan and we might later implement something more +sophisticated that does a better job of keeping chunks from the same utterance +or speaker together. + +It's pretty trivial. Basically we rely on the fact that the input utterances +come in in sorted order (so utterances from adjacent speakers will naturally be +together. + +We read the entries in the input scp file as a list, keeping them in the order +they were in the input. We split that list into distinct sub-lists, each with a unique value +of --, although in the expected +case there will be just one such sub-list. 
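As a concrete illustration of the splitting just described (and of the grouping described in the next paragraphs), a rough, untested Python sketch might look like the following. The key layout shown is an assumption (the exact field order is defined by the egs-dumping binary), and all names here are for illustration only:

    # Assumed key layout: <utt-id>-<first-frame>-<left-ctx>-<num-frames>-<right-ctx>-v1
    def split_into_sublists(scp_lines):
        # Keep the input order; entries sharing (left-ctx, num-frames, right-ctx)
        # go into the same sub-list.  Normally there is only one sub-list.
        sublists = {}   # dict preserves insertion order in Python 3.7+
        for line in scp_lines:
            key, rxfilename = line.split(None, 1)
            fields = key.split('-')
            structure = tuple(fields[-4:-1])   # (left-ctx, num-frames, right-ctx)
            sublists.setdefault(structure, []).append((key, rxfilename.strip()))
        return list(sublists.values())

    def make_groups(sublist, chunks_per_spk=4, num_repeats=1):
        # Consecutive chunks form a group; with num_repeats=2 we also shift by
        # half a group and group again, doubling the number of groups.
        n = chunks_per_spk
        groups = [sublist[i:i + n] for i in range(0, len(sublist) - n + 1, n)]
        if num_repeats == 2:
            shifted = sublist[n // 2:]
            groups += [shifted[i:i + n] for i in range(0, len(shifted) - n + 1, n)]
        return groups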
+ +In the case where --chunks-per-spk=4 and --num-repeats=1, the groups of +chunks would then just be (and we do this for each of the sub-lists): +the first 4 chunks; the second 4 chunks; and so on. In the case where +--chunks-per-spk=4 and --num-repeats=2, we'd obtain the groups as above, then +we'd discard the first 2 chunks of each sub-list and repeat the process, giving +us twice the original number of groups. + +Once we have the groups as above, we need to figure out the subset of +size --num-heldout-groups which will be chosen to appear in the output +file --heldout-subset-out. We'll also be choosing another subset of +the same size to appear in the file --training-subset-out; and we'll +be excluding some groups from the output --training-data-out (any +utterances that appeared in --heldout-subset-out, or which were linked +with such utterances via the --utt2uniq map, will be excluded). + +The way we choose the groups to hold out is as follows. In cases where +the utt2uniq file is undefined, treat it as the identity map. +We are given list of groups. We compute, for each group, the set of +utterances represented in it, and from that, the set of "uniq" +values (a "uniq" value is a string, representing a pre-augmentation +utterance-id). For each "uniq" value, we compute the set of +group-ids in which it was represented. For a given group, we +take the union of all those sets for its "uniq" value, and remove +its own group-id. The size of this set gives us a number >= 0 of the +number of other groups we'd have to exclude if we were to include +this particular group in the heldout subset. It might be zero only in +the case where there was no augmentation and --num-repeats=1, and +some particular utterance had been split into exactly 4 chunks which +all ended up in the same group. + + +num_groups which is +the number of groups in which that "uniq" value is represented. +Then, for each group, we can compute + + + + + +""" diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index ede0201f572..e992ba73e78 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -27,6 +27,7 @@ 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, + 'batchnorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 757963f13a7..1782fb817f9 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -27,6 +27,11 @@ leftmost_questions_truncate=-1 # note: this option is deprecated and has no eff tree_stats_opts= cluster_phones_opts= repeat_frames=false +num_clusters= # e.g. 200; can be used if you want a 2-level tree, and + # in that case the file tree.map will be output, which + # maps from the leaves to (effectively) clusters of + # leaves. We'll also output the file num_clusters which is + # the number of these clusters (normally == the option). # End configuration section. echo "$0 $@" # Print the command line for logging @@ -58,6 +63,9 @@ if [ $# != 5 ]; then echo " --frame-subsampling-factor # Factor (e.g. 
3) controlling frame subsampling" echo " # at the neural net output, so the frame rate at" echo " # the output is less than at the input." + echo " --num-clusters # Default: none. E.g. 200; can be used if you want" + echo " # a 2-level tree. Used in 'chaina' setup. The file" + echo " # tree.map will be output in this case." exit 1; fi @@ -168,11 +176,28 @@ if [ $stage -le -3 ] && $train_tree; then compile-questions $context_opts $lang/topo \ $dir/questions.int $dir/questions.qst || exit 1; - echo "$0: Building the tree" - $cmd $dir/log/build_tree.log \ - build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ - --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ - $dir/questions.qst $lang/topo $dir/tree || exit 1; + if [ -z "$num_clusters" ]; then + # normal case: single tree. + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + else + if ! [ $num_clusters -lt $numleaves ]; then + echo "$0: --num-clusters=$num_clusters must be less than num-leaves=$numleaves" + exit 1; + fi + $cmd $dir/log/build_tree.log \ + build-tree-two-level $context_opts --verbose=1 \ + --max-leaves-first=$num_clusters --max-leaves-second=$numleaves \ + $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree \ + "|copy-int-vector --binary=false - $dir/tree.map" || exit 1; + num_clusters_effective=$(cat $dir/tree.map awk '{nc=0; for(n=2;n=nc) nc=1+$n; }END{print nc}') + echo $num_clusters_effective >$dir/num_clusters + echo "$0: you requested --num-clusters=$num_clusters, you got 2nd-level tree num-leaves=$num_clusters_effective" + fi fi if [ $stage -le -2 ]; then diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_config.py b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py new file mode 100755 index 00000000000..e234ea732d4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +# Copyright 2016-2018 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, + +import argparse +import os +import sys +from collections import defaultdict + +sys.path.insert(0, 'steps/') +# the following is in case we weren't running this from the normal directory. +sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') + +import libs.nnet3.xconfig.parser as xparser +import libs.common as common_lib + + +def get_args(): + # we add compulsory arguments as named arguments for readability + parser = argparse.ArgumentParser( + description="Reads an xconfig file and creates config files " + "for neural net creation and training", + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser.add_argument('--xconfig-file', required=True, + help='Filename of input xconfig file') + parser.add_argument('--existing-model', + help='Filename of previously trained neural net ' + '(e.g. final.mdl) which is useful in case of ' + 'using nodes from list of component-nodes in ' + 'already trained model ' + 'to generate new config file for new model.' + 'The context info is also generated using ' + 'a model generated by adding final.config ' + 'to the existing model.' + 'e.g. 
In Transfer learning: generate new model using ' + 'component nodes in existing model.') + parser.add_argument('--config-file-out', required=True, + help='Filename to write nnet config file.'); + parser.add_argument('--nnet-edits', type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="""This option is useful in case the network you + are creating does not have an output node called + 'output' (e.g. for multilingual setups). You can set + this to an edit-string like: 'rename-node old-name=xxx + new-name=output' if node xxx plays the role of the + output node in this network. This is only used for + computing the left/right context.""") + + print(' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + + return args + + + +def write_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + with open(config_file_out, 'w') as f: + print('# This file was created by the command:\n' + '# {0} '.format(sys.argv), file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + +def main(): + args = get_args() + existing_layers = [] + if args.existing_model is not None: + existing_layers = xparser.get_model_component_info(args.existing_model) + all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) + write_config_file(args.config_file_out, all_layers) + + +if __name__ == '__main__': + main() + + +# test: +# (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; steps/nnet3/xconfig_to_config.py --xconfig-file=xconfig --config-file-out=foo diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 3b8dc82fe48..149ea3b63bf 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -39,8 +39,13 @@ def get_args(): 'to the existing model.' 'e.g. In Transfer learning: generate new model using ' 'component nodes in existing model.') - parser.add_argument('--config-dir', required=True, - help='Directory to write config files and variables') + parser.add_argument('--config-dir', required=False, + help='Directory to write config files and variables; either ' + 'this or --config-out must be specified.') + parser.add_argument('--config-out', required=False, + help='Filename to write nnet config file. This is the ' + 'simplified interface that does not support lda-layer. ' + 'Either this or --config-dir must be supplied.') parser.add_argument('--nnet-edits', type=str, default=None, action=common_lib.NullstrToNoneAction, help="""This option is useful in case the network you @@ -141,7 +146,7 @@ def write_expanded_xconfig_files(config_dir, all_layers): def get_config_headers(): """ This function returns a map from config-file basename - e.g. 'init', 'ref', 'layer1' to a documentation string that goes + e.g. 
'init', 'ref', 'final' to a documentation string that goes at the top of the file. """ # resulting dict will default to the empty string for any config files not @@ -230,6 +235,41 @@ def write_config_files(config_dir, all_layers): raise +# This is an alternative to 'write_config_files' where a single output +# file is desired (would correspond to the output 'final.config' in the +# normal setup). In this case, things like LDA and presoftmax are not +# supported. +def write_single_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + config_basename_to_header = get_config_headers() + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + + with open(config_file_out, 'w') as f: + header = config_basename_to_header['final'] + print(header, file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + def add_nnet_context_info(config_dir, nnet_edits=None, existing_model=None): """Create the 'vars' file that specifies model_left_context, etc.""" diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc index a8fb3cbed61..0e195b97a2f 100644 --- a/src/adapt/differentiable-transform-itf.cc +++ b/src/adapt/differentiable-transform-itf.cc @@ -165,5 +165,14 @@ void DifferentiableTransformMapped::Check() const { transform->NumClasses())); } +std::string DifferentiableTransformMapped::Info() const { + KALDI_ASSERT(transform != NULL); + std::ostringstream os; + os << "dim=" << transform->Dim() << std::endl + << "num-classes=" << transform->NumClasses() << std::endl + << "num-pdfs=" << NumPdfs() << std::endl; + return os.str(); +} + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index 8a8c4b5bd78..fe5240f9618 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -396,6 +396,16 @@ struct DifferentiableTransformMapped { void Write(std::ostream &os, bool binary) const; + // Returns a string something like: + // dim=256 + // num-classes=200 + // num-pdfs=6391 + // ... in future we will likely add more information, but for now you can get it by + // copying to text form and looking at it directly. + // the "num-classes" is transform->NumClasses(), and "num-pdfs" is + // pdf_map.size() if pdf_map is nonempty; else, transform->NumClasses(). + std::string Info() const; + // Check that the dimensions are consistent, i.e. pdf_map.empty() or // transform->NumClasses() == max-element-in-pdf_map + 1. 
void Check() const; diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 85e26bc7f30..23291eac167 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -286,9 +286,9 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " "in compressed format (recommended). Update: this is now " - "only relevant if the features being read are un-compressed; " - "if already compressed, we keep we same compressed format when " - "dumping-egs."); + "only relevant if the features being read are uncompressed; " + "if already compressed, we keep the same compressed format when " + "dumping egs."); po.Register("ivectors", &online_ivector_rspecifier, "Alias for " "--online-ivectors option, for back compatibility"); po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 555c83416c3..a657a690a77 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -363,7 +363,8 @@ UtteranceSplitter::~UtteranceSplitter() { KALDI_LOG << "Split " << total_num_utterances_ << " utts, with " << "total length " << total_input_frames_ << " frames (" << (total_input_frames_ / 360000.0) << " hours assuming " - << "100 frames per second)"; + << "100 frames per second) into " << total_num_chunks + << " chunks."; float average_chunk_length = total_frames_in_chunks_ * 1.0 / total_num_chunks_, overlap_percent = total_frames_overlap_ * 100.0 / total_input_frames_, output_percent = total_frames_in_chunks_ * 100.0 / total_input_frames_, diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 369b28e6cf8..3c13043b40c 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -83,7 +83,7 @@ NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( std::string model_filename, den_fst_filename, transform_filename; GetPathname(model_dir_, lang, "mdl", &model_filename); - GetPathname(den_fst_dir_, lang, "fst", &den_fst_filename); + GetPathname(den_fst_dir_, lang, "den.fst", &den_fst_filename); GetPathname(transform_dir_, lang, "ada", &transform_filename); { diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index a5b041e2848..0c2a7e2073c 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -130,7 +130,7 @@ class NnetChainaModels { /bottom.raw should exist, and then for each language name (e.g. "english"), the following files should exist: - /english.mdl /english.fst /english.ada + /english.mdl /english.den.fst /english.ada There is no requirement that all these directories be distinct. In practice, the language name will be either "default", in the @@ -166,7 +166,7 @@ class NnetChainaModels { "...?lang=xxx" in the key when reading the egs, see ParseFromQueryString() in nnet-chain-utils.h. @param [in] den_fst_ir Directory where we'll find the denominator - FST .fst for each language present in + FST .den.fst for each language present in the egs. @param [in] transform_dir Directory where we'll find the transforms (of type DifferentiableTransformItf), @@ -234,7 +234,7 @@ class NnetChainaModels { // stores a TransitionModel. 
TransitionModel trans_model; AmNnetSimple am_nnet; - // den_fst comes from /.fst + // den_fst comes from /.den.fst fst::StdVectorFst den_fst; // transform comes from /.ada differentiable_transform::DifferentiableTransformMapped transform; diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update index d3009a1fdca..c01aa208d50 100644 --- a/src/nnet3a/notes.update +++ b/src/nnet3a/notes.update @@ -25,18 +25,19 @@ Things needed per language in order to dump raw egs: - Format of raw egs dir (we'll likely delete this right after creation): info.txt: + dir_type raw_chaina_egs - num_clusters 200 num_chunks 120000 num_leaves 6543 frames_per_chunk 140,110,100 - # number of input frames, not including context num_input_frames_tot 432143218 left_context 10 left_context_initial 10 right_context 10 right_context_initial 10 + ... various configuration values here... + ... need utt2spk and utt2uniq file ... # note: tree.map is optional, since the egs don't depend on it, but it will generally # be generated with the tree. @@ -61,7 +62,23 @@ Things needed per language in order to dump raw egs: chunks_per_spk 4 num_chunks xxxx - den_fsts + den_fsts/ -> lang.fst + norm_fsts/ -> lang.fst + trees/lang.tree, ?lang.tree.map + trans_models/lang.trans_mdl + + + - Format of chain-training-input dir: + + - Two purposes: as input to the model training, and (if single language) as input for getting the egs? + + Need: + - the input models (bottom and top-per-language), the input .ada objects + - The trees per language? + - Options and the like + - List of languages + - Left and right context required for egs + - extra left/right context??? ================= @@ -89,12 +106,23 @@ Plans for binaries. ================ + steps/chaina/init_den_fst.sh + make den.fst, normalization.fst + + # Maybe just use nnet3-init in the scripts, to initialize the nnets, and + # copy them where they are needed. + + + # What's needed in a chain dir? + 0/bottom.raw,lang.mdl,lang.ada + + steps/chaina/init_chain_dir.sh make den.fst, normalization.fst, bottom.config, top.config, bottom.raw, top.raw -init.config, init.raw, 0.trans_mdl, + init.config, init.raw, 0.trans_mdl, final.config (but not 0.raw yet, might need egs first). 
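Since several of these scripts read fields back out of info.txt with awk, a tiny Python sketch of the implied format (one "key value..." pair per line; the file and key names are as in the notes above, everything else is illustrative) may help:

    def read_info(path):
        # Parse an egs-dir info.txt: the first whitespace-separated token is the
        # key, and the rest of the line is the value (which may itself contain
        # text with spaces, e.g. 'frames_per_chunk 140,110,100').
        info = {}
        with open(path) as f:
            for line in f:
                fields = line.split()
                if len(fields) >= 2:
                    info[fields[0]] = ' '.join(fields[1:])
        return info

    # e.g.:
    # info = read_info('exp/chaina/tdnn1a_sp/raw_egs/info.txt')
    # left_context = int(info['left_context'])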
diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh index 9a7ae2d9b29..a8d454b3c06 100644 --- a/tools/config/common_path.sh +++ b/tools/config/common_path.sh @@ -22,4 +22,5 @@ ${KALDI_ROOT}/src/rnnlmbin:\ ${KALDI_ROOT}/src/sgmm2bin:\ ${KALDI_ROOT}/src/sgmmbin:\ ${KALDI_ROOT}/src/tfrnnlmbin:\ +${KALDI_ROOT}/src/nnet3abin:\ $PATH From 0b443b20ca568cbf5ea856ea6d21e886886bc907 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Fri, 4 Jan 2019 15:04:15 -0500 Subject: [PATCH 53/87] Add the --use-query-string option to nnet3-chain-merge-egs --- src/chainbin/nnet3-chain-merge-egs.cc | 11 ++++++++++- src/nnet3/nnet-chain-example.cc | 8 +++++++- src/nnet3/nnet-chain-example.h | 6 ++++++ src/nnet3/nnet-example-utils.cc | 2 +- src/nnet3/nnet-example-utils.h | 13 +++++++++++-- 5 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc index a3686d2fc30..926cda76cf3 100644 --- a/src/chainbin/nnet3-chain-merge-egs.cc +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -64,7 +64,16 @@ int main(int argc, char *argv[]) { ChainExampleMerger merger(merging_config, &example_writer); for (; !example_reader.Done(); example_reader.Next()) { const NnetChainExample &cur_eg = example_reader.Value(); - merger.AcceptExample(new NnetChainExample(cur_eg)); + NnetChainExample *cur_eg_copy = new NnetChainExample(cur_eg); + if (merging_config.use_query_string) { + std::string key = example_reader.Key(); + int pos = key.find('?'); + if (pos != std::string::npos) { + std::string query = key.substr(pos + 1, key.size() - pos - 1); + cur_eg_copy->bucket = query; + } + } + merger.AcceptExample(cur_eg_copy); } // the merger itself prints the necessary diagnostics. merger.Finish(); diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index a0614da4916..e5aa13b848b 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -187,6 +187,7 @@ void NnetChainExample::Read(std::istream &is, bool binary) { void NnetChainExample::Swap(NnetChainExample *other) { inputs.swap(other->inputs); outputs.swap(other->outputs); + std::swap(bucket, other->bucket); } void NnetChainExample::Compress() { @@ -420,12 +421,13 @@ size_t NnetChainExampleStructureHasher::operator () ( const NnetChainExample &eg) const noexcept { // these numbers were chosen at random from a list of primes. NnetIoStructureHasher io_hasher; + StringHasher string_hasher; size_t size = eg.inputs.size(), ans = size * 35099; + ans += string_hasher(eg.bucket); for (size_t i = 0; i < size; i++) ans = ans * 19157 + io_hasher(eg.inputs[i]); for (size_t i = 0; i < eg.outputs.size(); i++) { const NnetChainSupervision &sup = eg.outputs[i]; - StringHasher string_hasher; IndexVectorHasher indexes_hasher; ans = ans * 17957 + string_hasher(sup.name) + indexes_hasher(sup.indexes); @@ -436,6 +438,8 @@ size_t NnetChainExampleStructureHasher::operator () ( bool NnetChainExampleStructureCompare::operator () ( const NnetChainExample &a, const NnetChainExample &b) const { + if (a.bucket != b.bucket) + return false; NnetIoStructureCompare io_compare; if (a.inputs.size() != b.inputs.size() || a.outputs.size() != b.outputs.size()) @@ -518,6 +522,8 @@ void ChainExampleMerger::WriteMinibatch( MergeChainExamples(config_.compress, egs, &merged_eg); std::ostringstream key; key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; + if (!(*egs)[0].bucket.empty()) + key << "?" 
<< (*egs)[0].bucket; writer_->Write(key.str(), merged_eg); } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 5122739a38c..f96dc81369f 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -128,6 +128,12 @@ struct NnetChainExample { /// be just one member with name == "output". std::vector outputs; + /// This relates to the '--use-query-string' option for merging. Examples + /// with different values of 'bucket' won't be merged together. Note that + /// this member variable is not written or read (in the Write/Read functions) + /// as it's not a permanent part of an eg. It's only used in the merging code. + std::string bucket; + void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index a657a690a77..f837ce27c66 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -363,7 +363,7 @@ UtteranceSplitter::~UtteranceSplitter() { KALDI_LOG << "Split " << total_num_utterances_ << " utts, with " << "total length " << total_input_frames_ << " frames (" << (total_input_frames_ / 360000.0) << " hours assuming " - << "100 frames per second) into " << total_num_chunks + << "100 frames per second) into " << total_num_chunks_ << " chunks."; float average_chunk_length = total_frames_in_chunks_ * 1.0 / total_num_chunks_, overlap_percent = total_frames_overlap_ * 100.0 / total_input_frames_, diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 52b2ebbf904..0553eeb3d82 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -88,7 +88,6 @@ struct ExampleGenerationConfig { int32 frame_subsampling_factor; std::string num_frames_str; - // The following parameters are derived parameters, computed by // ComputeDerived(). @@ -325,12 +324,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. + bool use_query_string; ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + use_query_string(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -354,6 +355,14 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("use-query-string", &use_query_string, "If true, the part of " + "the key name after the final '?' in the string (if one " + "is present) will be required to match when determining " + "which egs may be merged (so only egs with the same text " + "after the '?' will be merged), and the key used in the " + "output will end with the same query string, including " + "the '?'. 
An example query string is: " + "'?lang=english&tw=0.5&bw=1.0'"); } From 195168f63efc778111a05d3a7200d7ad595226bb Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 3 Jan 2019 22:03:35 -0800 Subject: [PATCH 54/87] [scripts] More documentation for choose_egs_to_merge.py --- .../chaina/internal/choose_egs_to_merge.py | 78 +++++++++++-------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index 7c0dc11d989..886ca1d974f 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -85,26 +85,31 @@ """ -Notes on plan for how to implement this (we can keep this as documentation). +Notes on plan for how to implement this (we can keep this as documentation, but +we'll maybe move some of it around when things get implemented). + This is a rather simple plan and we might later implement something more sophisticated that does a better job of keeping chunks from the same utterance -or speaker together. +or the same speaker together. -It's pretty trivial. Basically we rely on the fact that the input utterances -come in in sorted order (so utterances from adjacent speakers will naturally be -together. +Basically we rely on the fact that the input utterances come in in sorted order +(so utterances from adjacent speakers will naturally be together. We read the entries in the input scp file as a list, keeping them in the order -they were in the input. We split that list into distinct sub-lists, each with a unique value -of --, although in the expected -case there will be just one such sub-list. +they were in the input (which will naturally keep together chunks from the +same utterance and utterances from the same speaker, since the raw egs were +not randomized). We split that list into distinct sub-lists, each with a unique value +of --. In the normal case +there will be just one such sub-list. In the case where --chunks-per-spk=4 and --num-repeats=1, the groups of chunks would then just be (and we do this for each of the sub-lists): the first 4 chunks; the second 4 chunks; and so on. In the case where --chunks-per-spk=4 and --num-repeats=2, we'd obtain the groups as above, then we'd discard the first 2 chunks of each sub-list and repeat the process, giving -us twice the original number of groups. +us twice the original number of groups. If you want you can just +assert that --num-repeats is either 1 or 2 for now; higher values don't +really make sense with the current approach for choosing groups. Once we have the groups as above, we need to figure out the subset of size --num-heldout-groups which will be chosen to appear in the output @@ -114,28 +119,37 @@ utterances that appeared in --heldout-subset-out, or which were linked with such utterances via the --utt2uniq map, will be excluded). -The way we choose the groups to hold out is as follows. In cases where -the utt2uniq file is undefined, treat it as the identity map. -We are given list of groups. We compute, for each group, the set of -utterances represented in it, and from that, the set of "uniq" -values (a "uniq" value is a string, representing a pre-augmentation -utterance-id). For each "uniq" value, we compute the set of -group-ids in which it was represented. For a given group, we -take the union of all those sets for its "uniq" value, and remove -its own group-id. 
The size of this set gives us a number >= 0 of the -number of other groups we'd have to exclude if we were to include -this particular group in the heldout subset. It might be zero only in -the case where there was no augmentation and --num-repeats=1, and -some particular utterance had been split into exactly 4 chunks which -all ended up in the same group. - - -num_groups which is -the number of groups in which that "uniq" value is represented. -Then, for each group, we can compute - - - - +The way we choose the groups to appear in --heldout-subset-out is as follows. +Firstly: in cases where the utt2uniq file is undefined, treat it as the identity +map. We are given list of groups. We compute, for each group, the set of +utterances represented in it, and from that, the set of "uniq" values (a "uniq" +value is a string, representing a pre-augmentation utterance-id). For each +"uniq" value, we will compute the set of group-ids in which it was represented. +For a given group, we take the union of all those sets for its "uniq" value, and +remove its own group-id; this gives us the set of other groups that share a +pre-augmentation utterance in common with this group. This set might be empty +only in the case where there was no augmentation and --num-repeats=1, and some +particular utterance had been split into exactly 4 chunks which all ended up in +the same group. + +From the information above we can sort the groups by the number of groups we'd +have to hold out if we were to put that group in the heldout set. Then if, say, +--heldout-data-selection-proportion=0.2, we take the bottom 20% of groups by +this measure, meaning the groups which will cause less training data to have to +be held out. This is the set from which we'll select the heldout data and the +matched subset of training data. Call this the "candidate set". We first +choose --num-heldout-groups groups from the candidate set. This is the heldout +subset. From the heldout subset we compute the set of "uniq" values represented, +and we remove from the training set any groups which share those "uniq" values. + +Next we need to choose the matched subset of training examples. The way we do +this is that we choose --num-heldout-groups from the "candidate set", after +excluding groups that were in the heldout subset or which were removed from the +training set because they contained "uniq" values in common with those in the +heldout set. If this fails because there were too few groups in the candidate +set, just double --heldout-data-selection-proportion and retry. Make sure to do +something sensible in the case where the dataset is too tiny to choose the +requested heldout set size (i.e. print an informative error message before +dying). 
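To make the selection procedure above more concrete, here is a rough, untested Python sketch; all names are invented for illustration and the real script may structure this differently:

    import random

    def choose_heldout_and_subsets(groups, utt2uniq, num_heldout_groups,
                                   selection_proportion=0.2):
        # groups: list of groups, each a list of utterance-ids.
        # utt2uniq: utterance-id -> pre-augmentation id (identity map if absent).
        group_uniqs = []          # per group, the set of "uniq" values it contains
        uniq_to_groups = {}       # "uniq" value -> set of group indexes containing it
        for g, utts in enumerate(groups):
            uniqs = set(utt2uniq.get(u, u) for u in utts)
            group_uniqs.append(uniqs)
            for q in uniqs:
                uniq_to_groups.setdefault(q, set()).add(g)

        def num_excluded(g):
            # How many *other* groups share a "uniq" value with group g.
            others = set()
            for q in group_uniqs[g]:
                others |= uniq_to_groups[q]
            others.discard(g)
            return len(others)

        # Candidate pool: the groups that are cheapest to hold out.
        candidates = sorted(range(len(groups)), key=num_excluded)
        pool_size = max(num_heldout_groups, int(selection_proportion * len(groups)))
        pool = candidates[:pool_size]
        # (a real implementation would enlarge the pool, or fail with a clear
        # message, if there are not enough groups at this point)
        heldout = set(random.sample(pool, num_heldout_groups))

        heldout_uniqs = set()
        for g in heldout:
            heldout_uniqs |= group_uniqs[g]
        training = [g for g in range(len(groups))
                    if not (group_uniqs[g] & heldout_uniqs)]

        # Statistically matched training subset, drawn from the same pool.
        training_set = set(training)
        remaining = [g for g in pool if g in training_set]
        train_subset = random.sample(remaining, min(num_heldout_groups, len(remaining)))
        return heldout, training, train_subset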
""" From b96e80c84902a6817068115e82ad4e4e574cb59d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 4 Jan 2019 15:05:17 -0800 Subject: [PATCH 55/87] [scripts] Further progress on chaina scripts --- .../chaina/internal/choose_egs_to_merge.py | 19 ++- egs/wsj/s5/steps/chaina/process_egs.sh | 158 ++++++++++++++++++ 2 files changed, 169 insertions(+), 8 deletions(-) create mode 100755 egs/wsj/s5/steps/chaina/process_egs.sh diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index 886ca1d974f..60a56a5bca9 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -28,14 +28,14 @@ "means the number of chunks per group of chunks, and they are " "only preferentially taken from the same speaker.") parser.add_argument("--num-repeats", type=int, default=1, - "The number of times the data is to be repeated. Must divide " + help="The number of times the data is to be repeated. Must divide " "--chunks-per-spk. Suggest to try only 1 or 2. The idea " "is to divide chunks into groups in different ways, to give " "more variety to the egs (since the adaptation information " "will differ.") parser.add_argument("--heldout-data-selection-proportion", type=float, default=0.2, - "This parameter governs the selection of the heldout " + help="This parameter governs the selection of the heldout " "subset and the statistically matched training subset. " "It does not affect the size of that subset, but only " "affects what pool the examples are drawb from. " @@ -49,11 +49,11 @@ "gives a selection that's statistically more " "representative).") parser.add_argument("--num-heldout-groups", type=int, default=200, - "Number of utterance groups " + help="Number of utterance groups " "that will go in the heldout subset (and in the " "statistically matched training subset)") parser.add_argument("--utt2uniq", type=str, default='', - "File used in setups with data " + help="File used in setups with data " "augmentation, that maps from utterance-ids to the " "pre-augmentation utterance-id. The reason it's needed " "is to ensure that the heldout set is properly held " @@ -61,7 +61,7 @@ "weren't trained on. If not specified, we assume the " "identity map.") parser.add_argument("--scp-in", type=str, required=True, - "The scp file in, likely containing chain egs. The " + help="The scp file in, likely containing chain egs. The " "keys are expected to be of the form: " "'-----v1', " "where the left_context, num_frames and right_context are required to be the " @@ -71,18 +71,21 @@ "order to prevent this constraint from splitting up the utterances from " "a single speaker") parser.add_argument("--training-data-out", type=str, required=True, - "The output file containing the chunks that are to be grouped; each " + help="The output file containing the chunks that are to be grouped; each " "line will contain --chunks-per-spk (e.g. 
4) rxfilenames, obtained " "from the second field of the input --scp-in file.") parser.add_argument("--heldout-subset-out", type=str, required=True, - "This is the name of the file to which the heldout data subset " + help="This is the name of the file to which the heldout data subset " "will be written; the format is the same as --training-data-out.") parser.add_argument("--training-subset-out", type=str, required=True, - "This is the name of the file to which the statistically matched " + help="This is the name of the file to which the statistically matched " "(to --heldout-subset-out) set of training data will be written") +args = parser.parse_args() +# TODO: please print the command line to stderr for logging purposes. +# Any useful debugging messages can go to stderr too. """ Notes on plan for how to implement this (we can keep this as documentation, but diff --git a/egs/wsj/s5/steps/chaina/process_egs.sh b/egs/wsj/s5/steps/chaina/process_egs.sh new file mode 100755 index 00000000000..d898ea1f4d2 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/process_egs.sh @@ -0,0 +1,158 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script takes nnet examples dumped by steps/chaina/get_raw_egs.sh and +# combines the chunks into groups by speaker (to the extent possible; it may +# need to combine speakers in some cases), locally randomizes the result, and +# dumps the resulting egs to disk. Chunks of these will later be globally +# randomized (at the scp level) by steps/chaina/randomize_egs.sh + + +# Begin configuration section. +cmd=run.pl +chunks_per_spk=4 +num_repeats=2 # number of times we repeat the same chunks with different + # grouping. Recommend 1 or 2; must divide chunks_per_spk +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + + +num_heldout_groups=200 # The number of groups (i.e. groups of chunks) that + # will go in the held-out set and the train subset + # (heldout_subset.scp and train_subset.scp). The real + # point of train_subset.scp, and the reason we can't + # just use a subset of train.scp, is that it contains + # egs that are statistically comparable to + # heldout_subset.scp, so their prob can be + # meaningfully compared with those from + # heldout_subset.scp. Note: the number (e.g. 200) is + # *after* merging chunks into groups of size + # $chunks_per_spk. + + +shuffle_buffer_size=5000 # Size of buffer (containing grouped egs) to use + # for random shuffle. + +stage=0 +nj=5 # the number of parallel jobs to run. +srand=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --chunks-per-spk 4 exp/chaina/tdnn1a_sp/raw_egs exp/chaina/tdnn1a_sp/processed_egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --chunks-per-spk # Number of chunks (preferentially, from a single speaker" + echo " # to combine into each example. This grouping of" + echo " # egs is part of the 'chaina' framework; the adaptation" + echo " # parameters will be estimated from these groups." + echo " --num-repeats # Number of times we group the same chunks into different" + echo " # groups. 
For now only the values 1 and 2 are" + echo " # recommended, due to the very simple way we choose" + echo " # the groups (it's consecutive)." + echo " --nj # Number of jobs to run in parallel. Usually quite a" + echo " # small number, as we'll be limited by disk access" + echo " # speed." + echo " --compress # True if you want the egs to be compressed" + echo " # (e.g. you may set to false for debugging purposes, to" + echo " # check that the compression is not hurting)." + echo " --num-heldout-egs # Number of egs to put in train_subset.scp and heldout_subset.scp." + echo " # These will be used for diagnostics. Note: this number is" + echo " # the number of grouped egs, after merging --chunks-per-spk" + echo " # chunks into a single eg." + echo " # ... may be a comma separated list, but we advise a single" + echo " # number in most cases, due to interaction with the need " + echo " # to group egs from the same speaker into groups." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +raw_egs_dir=$1 +dir=$2 + +# die on error or undefined variable. +set -e -u + +for f in $raw_egs_dir/all.scp $raw_egs_dir/info.txt $raw_egs_dir/misc/utt2spk; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist." + exit 1 + fi +done + +if ! awk '/dir_type /{if ($2 != "raw_chaina_dir") exit(1); }'; then + echo "$0: input directory $raw_egs_dir does not seem to be of the right type." +fi + + + +mkdir -p $dir/temp $dir/log + + +if [ $stage -le 0 ]; then + echo "$0: choosing egs to merge" + + utt2uniq_opt= + [ -f $raw_egs_dir/misc/utt2uniq ] && utt2uniq_opt="--utt2uniq=$raw_egs_dir/misc/utt2uniq" + + $cmd $dir/log/choose_egs_to_merge.log steps/chaina/internal/choose_egs_to_merge.py \ + --chunks-per-spk=$chunks_per_spk \ + --num-repeats=$num_repeats \ + --num-heldout-groups=$num_heldout_groups \ + $utt2uniq_opt \ + --scp-in=$raw_egs_dir/all.scp \ + --training-data-out=$dir/temp/train.list \ + --heldout-subset-out=$dir/temp/heldout_subset.list \ + --training-subset-out=$dir/temp/train_subset.list +fi + +if [ $stage -le 1 ]; then + + + for name in heldout_subset train_subset; do + echo "$0: merging and shuffling $train egs" + + # Linearize these lists and add keys to make it an scp format. + awk '{for (n=1;n<=NF;n++) { count++; print count "-" $n; }' <$dir/temp/${name}.list >$dir/temp/${name}.scp + + $cmd $dir/log/merge_${name}_egs.log \ + nnet3-chain-merge-egs --compress=$compress scp:$dir/temp/${name}.scp ark:- \| \ + nnet3-chain-shuffle-egs --srand=$srand $ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp + done + + # Split up the training list into multiple smaller lists, as it could be long. + utils/split_scp.pl $dir/train.list $(for j in $(seq $nj); do echo $dir/temp/train.$j.list; done) + for j in $(seq $nj); do + awk '{for (n=1;n<=NF;n++) { count++; print count "-" $n; }' <$dir/temp/train.$j.list >$dir/temp/train.$j.scp + done + + if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
+ echo "$0: creating data links" + utils/create_data_link.pl $(for j in $(seq $nj); do echo $dir/train.$j.ark; done) + fi + + + $cmd JOB=1:$nj $dir/log/merge_train_egs.JOB.log \ + nnet3-chain-merge-egs --compress=$compress scp:$dir/temp/train.JOB.scp ark:- \| \ + nnet3-chain-shuffle-egs --shuffle-buffer-size=$shuffle_buffer_size \ + --srand=\$[JOB+$srand] ark:- ark,scp:$dir/train.JOB.ark,$dir/train.JOB.scp + + cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) > $dir/train.scp +fi + + + +echo "$0: Finished processing egs" From f6087c9bd93cf1e0afdb6e8b2f7a9b5d132bc1da Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 5 Jan 2019 19:59:58 -0800 Subject: [PATCH 56/87] [src,scripts,egs] Further progress; some renaming. --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 4 +- egs/wsj/s5/steps/chaina/get_raw_egs.sh | 5 +- .../chaina/internal/choose_egs_to_merge.py | 14 +- egs/wsj/s5/steps/chaina/process_egs.sh | 75 ++++--- egs/wsj/s5/steps/chaina/randomize_egs.sh | 185 ++++++++++++++++++ .../s5/steps/chaina/validate_processed_egs.sh | 48 +++++ egs/wsj/s5/steps/chaina/validate_raw_egs.sh | 46 +++++ src/nnet3/nnet-chain-example.cc | 26 +-- src/nnet3/nnet-chain-example.h | 9 +- src/nnet3a/nnet-chaina-training.cc | 16 +- src/nnet3a/nnet-chaina-training.h | 9 +- 11 files changed, 374 insertions(+), 63 deletions(-) create mode 100755 egs/wsj/s5/steps/chaina/randomize_egs.sh create mode 100755 egs/wsj/s5/steps/chaina/validate_processed_egs.sh create mode 100755 egs/wsj/s5/steps/chaina/validate_raw_egs.sh diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 0ed32ea20aa..33440cd9495 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -39,7 +39,7 @@ egs_extra_right_context=5 # The number of chunks (of length: see $chunk_width above) that we group # together for each "speaker" (actually: pseudo-speaker, since we may have # to group multiple speaker together in some cases). -chunks_per_spk=4 +chunks_per_group=4 # End configuration section. @@ -299,7 +299,7 @@ fi if [ $stage -le 19 ]; then echo "$0: about to process egs" steps/chaina/process_egs.sh \ - --chunks-per-spk ${chunks_per_spk} ${dir}/raw_egs ${dir}/processed_egs + --chunks-per-group ${chunks_per_group} ${dir}/raw_egs ${dir}/processed_egs fi diff --git a/egs/wsj/s5/steps/chaina/get_raw_egs.sh b/egs/wsj/s5/steps/chaina/get_raw_egs.sh index f0aab6e2e55..50139a86208 100755 --- a/egs/wsj/s5/steps/chaina/get_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/get_raw_egs.sh @@ -113,7 +113,7 @@ if [ $# != 4 ]; then echo " # supervision lattices are to be interpreted. Affects pruning" echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." echo " # (e.g., might be relevant for unsupervised training)." - echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " --stage # Used to run this script from somewhere in" echo " # the middle." exit 1; fi @@ -124,7 +124,8 @@ latdir=$3 dir=$4 tree=$chaindir/${lang}.tree -trans_mdl=$chaindir/0/${lang}.mdl # contains the transition model and a nnet. +trans_mdl=$chaindir/0/${lang}.mdl # contains the transition model and a nnet, but + # we won't be making use of the nnet part. 
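+                                  # (The transition model is still needed here:
+                                  # together with the tree it is what converts
+                                  # the lattice alignments into the pdf-level
+                                  # chain supervision when the raw egs are
+                                  # dumped; only the nnet part goes unused.)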
normalization_fst=$chaindir/0/${lang}.normalization.fst den_fst=$chaindir/0/${lang}.den.fst diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index 60a56a5bca9..6d705de4bdf 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -12,24 +12,24 @@ parser = argparse.ArgumentParser(description="Chooses groups of examples to merge into groups " - "of size given by the --chunks-per-spk option, based on speaker " + "of size given by the --chunks-per-group option, based on speaker " "information (preferentially, chunks from the same utterance " "and, if possible, the same speaker, get combined into " "groups). This script also computes a held-out subset of...", epilog="E.g. " + sys.argv[0] + "*** TODO *** ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -# Also maybe have --num-repeats, which must divide --chunks-per-spk? Can be +# Also maybe have --num-repeats, which must divide --chunks-per-group? Can be # used to divide data into different groups than the default ones. -parser.add_argument("--chunks-per-spk", type=int, default=4, +parser.add_argument("--chunks-per-group", type=int, default=4, help="Number of chunks per speaker in the final egs (actually " "means the number of chunks per group of chunks, and they are " "only preferentially taken from the same speaker.") parser.add_argument("--num-repeats", type=int, default=1, help="The number of times the data is to be repeated. Must divide " - "--chunks-per-spk. Suggest to try only 1 or 2. The idea " + "--chunks-per-group. Suggest to try only 1 or 2. The idea " "is to divide chunks into groups in different ways, to give " "more variety to the egs (since the adaptation information " "will differ.") @@ -72,7 +72,7 @@ "a single speaker") parser.add_argument("--training-data-out", type=str, required=True, help="The output file containing the chunks that are to be grouped; each " - "line will contain --chunks-per-spk (e.g. 4) rxfilenames, obtained " + "line will contain --chunks-per-group (e.g. 4) rxfilenames, obtained " "from the second field of the input --scp-in file.") parser.add_argument("--heldout-subset-out", type=str, required=True, help="This is the name of the file to which the heldout data subset " @@ -105,10 +105,10 @@ of --. In the normal case there will be just one such sub-list. -In the case where --chunks-per-spk=4 and --num-repeats=1, the groups of +In the case where --chunks-per-group=4 and --num-repeats=1, the groups of chunks would then just be (and we do this for each of the sub-lists): the first 4 chunks; the second 4 chunks; and so on. In the case where ---chunks-per-spk=4 and --num-repeats=2, we'd obtain the groups as above, then +--chunks-per-group=4 and --num-repeats=2, we'd obtain the groups as above, then we'd discard the first 2 chunks of each sub-list and repeat the process, giving us twice the original number of groups. If you want you can just assert that --num-repeats is either 1 or 2 for now; higher values don't diff --git a/egs/wsj/s5/steps/chaina/process_egs.sh b/egs/wsj/s5/steps/chaina/process_egs.sh index d898ea1f4d2..41232a41972 100755 --- a/egs/wsj/s5/steps/chaina/process_egs.sh +++ b/egs/wsj/s5/steps/chaina/process_egs.sh @@ -11,9 +11,9 @@ # Begin configuration section. cmd=run.pl -chunks_per_spk=4 +chunks_per_group=4 num_repeats=2 # number of times we repeat the same chunks with different - # grouping. 
Recommend 1 or 2; must divide chunks_per_spk + # grouping. Recommend 1 or 2; must divide chunks_per_group compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). @@ -28,7 +28,7 @@ num_heldout_groups=200 # The number of groups (i.e. groups of chunks) that # meaningfully compared with those from # heldout_subset.scp. Note: the number (e.g. 200) is # *after* merging chunks into groups of size - # $chunks_per_spk. + # $chunks_per_group. shuffle_buffer_size=5000 # Size of buffer (containing grouped egs) to use @@ -46,13 +46,13 @@ if [ -f path.sh ]; then . ./path.sh; fi if [ $# != 2 ]; then echo "Usage: $0 [opts] " - echo " e.g.: $0 --chunks-per-spk 4 exp/chaina/tdnn1a_sp/raw_egs exp/chaina/tdnn1a_sp/processed_egs" + echo " e.g.: $0 --chunks-per-group 4 exp/chaina/tdnn1a_sp/raw_egs exp/chaina/tdnn1a_sp/processed_egs" echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options (alternative to this" echo " # command line)" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." - echo " --chunks-per-spk # Number of chunks (preferentially, from a single speaker" + echo " --chunks-per-group # Number of chunks (preferentially, from a single speaker" echo " # to combine into each example. This grouping of" echo " # egs is part of the 'chaina' framework; the adaptation" echo " # parameters will be estimated from these groups." @@ -68,12 +68,12 @@ if [ $# != 2 ]; then echo " # check that the compression is not hurting)." echo " --num-heldout-egs # Number of egs to put in train_subset.scp and heldout_subset.scp." echo " # These will be used for diagnostics. Note: this number is" - echo " # the number of grouped egs, after merging --chunks-per-spk" + echo " # the number of grouped egs, after merging --chunks-per-group" echo " # chunks into a single eg." echo " # ... may be a comma separated list, but we advise a single" echo " # number in most cases, due to interaction with the need " echo " # to group egs from the same speaker into groups." - echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " --stage # Used to run this script from somewhere in" echo " # the middle." exit 1; fi @@ -84,19 +84,12 @@ dir=$2 # die on error or undefined variable. set -e -u -for f in $raw_egs_dir/all.scp $raw_egs_dir/info.txt $raw_egs_dir/misc/utt2spk; do - if [ ! -f $f ]; then - echo "$0: expected file $f to exist." - exit 1 - fi -done - -if ! awk '/dir_type /{if ($2 != "raw_chaina_dir") exit(1); }'; then - echo "$0: input directory $raw_egs_dir does not seem to be of the right type." +if ! 
steps/chaina/validate_raw_egs_dir $raw_egs_dir; then + echo "$0: failed to validate input directory $raw_egs_dir" + exit 1 fi - mkdir -p $dir/temp $dir/log @@ -107,7 +100,7 @@ if [ $stage -le 0 ]; then [ -f $raw_egs_dir/misc/utt2uniq ] && utt2uniq_opt="--utt2uniq=$raw_egs_dir/misc/utt2uniq" $cmd $dir/log/choose_egs_to_merge.log steps/chaina/internal/choose_egs_to_merge.py \ - --chunks-per-spk=$chunks_per_spk \ + --chunks-per-group=$chunks_per_group \ --num-repeats=$num_repeats \ --num-heldout-groups=$num_heldout_groups \ $utt2uniq_opt \ @@ -119,7 +112,6 @@ fi if [ $stage -le 1 ]; then - for name in heldout_subset train_subset; do echo "$0: merging and shuffling $train egs" @@ -127,12 +119,16 @@ if [ $stage -le 1 ]; then awk '{for (n=1;n<=NF;n++) { count++; print count "-" $n; }' <$dir/temp/${name}.list >$dir/temp/${name}.scp $cmd $dir/log/merge_${name}_egs.log \ - nnet3-chain-merge-egs --compress=$compress scp:$dir/temp/${name}.scp ark:- \| \ - nnet3-chain-shuffle-egs --srand=$srand $ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp + nnet3-chain-merge-egs --minibatch-size=$chunks_per_group --compress=$compress \ + scp:$dir/temp/${name}.scp ark:- \| \ + nnet3-chain-shuffle-egs --srand=$srand $ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp done # Split up the training list into multiple smaller lists, as it could be long. utils/split_scp.pl $dir/train.list $(for j in $(seq $nj); do echo $dir/temp/train.$j.list; done) + # Linearize these lists and add keys to make them in scp format; + # nnet3-chain-merge-egs will merge the right groups, it's deterministic + # and we specified --minibatch-size=$chunks_per_group. for j in $(seq $nj); do awk '{for (n=1;n<=NF;n++) { count++; print count "-" $n; }' <$dir/temp/train.$j.list >$dir/temp/train.$j.scp done @@ -146,13 +142,46 @@ if [ $stage -le 1 ]; then $cmd JOB=1:$nj $dir/log/merge_train_egs.JOB.log \ - nnet3-chain-merge-egs --compress=$compress scp:$dir/temp/train.JOB.scp ark:- \| \ + nnet3-chain-merge-egs --compress=$compress --minibatch-size=$chunks_per_group \ + scp:$dir/temp/train.JOB.scp ark:- \| \ nnet3-chain-shuffle-egs --shuffle-buffer-size=$shuffle_buffer_size \ --srand=\$[JOB+$srand] ark:- ark,scp:$dir/train.JOB.ark,$dir/train.JOB.scp - cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) > $dir/train.scp + # the awk command is to ensure unique ids for each group. + cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) | awk '{printf("%09d %s\n", NR, $2);}' > $dir/train.scp +fi + + +cat $raw_egs_dir/info.txt | awk -v num_repeats=$num_repeats \ + -v chunks_per_group=$chunks_per_group ' + /^dir_type/ { print "dir_type processed_chaina_egs"; next; } + /^num_input_frames/ { print $2 * num_repeats; next; } # approximate; ignores held-out egs. + {print;} + END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt + +# # Note: the info.txt will actually look like the following, in general, +# # taking into account the fields present in the info.txt in the source dir: +# dir_type processed_chaina_egs +# num_input_frames $num_frames +# num_chunks $num_chunks +# lang $lang +# feat_dim $feat_dim +# num_leaves $num_leaves +# frames_per_chunk $frames_per_chunk +# frames_per_chunk_avg $frames_per_chunk_avg +# left_context $left_context +# left_context_initial $left_context_initial +# right_context $right_context +# right_context_final $right_context_final +# chunks_per_group $chunks_per_group + + +if ! 
cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then + echo "$0: we failed to obtain at least one of the fields in $dir/info.txt" + exit 1 fi +cp -r $raw_egs_dir/misc/ $dir/ echo "$0: Finished processing egs" diff --git a/egs/wsj/s5/steps/chaina/randomize_egs.sh b/egs/wsj/s5/steps/chaina/randomize_egs.sh new file mode 100755 index 00000000000..37e1aaaa372 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/randomize_egs.sh @@ -0,0 +1,185 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script takes nnet examples dumped by steps/chaina/process_egs.sh, +# globally randomizes the egs, and divides into multiple .scp files. This is +# the form of egs which is consumed by the training script. All this is done +# only by manipulating the contents of .scp files. To keep locality of disk +# access, we only randomize blocks of egs (e.g. blocks containing 128 groups of +# sequences). This doesn't defeat randomization, because both process_egs.sh +# and the training script use nnet3-shuffle-egs to do more local randomization. + +# Later on, we'll have a multilingual/multi-input-dir version fo this script +# that combines egs from various data sources and possibly multiple languages. +# This version assumes there is just one language. + +# Begin configuration section. +cmd=run.pl + +groups_per_block=128 # The 'groups' are the egs in the scp file from + # process_egs.sh, containing '--chunks-per-group' sequences + # each. + +frames_per_job=3000000 # The number of frames of data we want to process per + # training job (will determine how long each job takes, + # and the frequency of model averaging. This was + # previously called --frames-per-iter, but + # --frames-per-job is clearer as each job does this + # many. + +num_groups_combine=1000 # the number of groups from the training set that we + # randomly choose as input to nnet3-chain-combine; + # these will go to combine.scp. train_subset.scp and + # heldout_subset.scp are, for now, just copied over + # from the input. + +# Later we may provide a mechanism to change the language name; for now we +# just copy it from the input. + + +srand=0 +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --frames-per-job 200000 exp/chaina/tdnn1a_sp/processed_egs exp/chaina/tdnn1a_sp/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --groups-per-block # The number of groups (i.e. previously merged egs" + echo " # containing --chunks-per-group chunks) to to consider " + echo " # as one block, where whole blocks are randomized;" + echo " # smaller means more complete randomization but less" + echo " # local disk access." + echo " --frames-per-job # The number of input frames (not counting context)" + echo " # that we aim to have in each scp file after" + echo " # randomization and splitting." + echo " --num-groups-combine # The number of randomly chosen groups to" + echo " # put in the subset in 'combine.scp' which will" + echo " # be used in nnet3-chaina-combine to decide which" + echo " # models to average over." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + echo " --srand # Random seed, affects randomization." 
+ exit 1; +fi + +processed_egs_dir=$1 +dir=$2 + +# die on error or undefined variable. +set -e -u + +for f in train.scp heldout_subset.scp train_subset.scp info.txt; do + if [ ! -f $processed_egs_dir/$f ]; then + echo "$0: expected file $processed_egs_dir/$f to exist." + exit 1 + fi +done + +if ! awk '/dir_type /{if ($2 != "processed_chaina_dir") exit(1); }' <$processed_egs_dir/info.txt; then + echo "$0: input directory $processed_egs_dir does not seem to be of the right type." +fi + + +# Work out how many groups per job and how many frames per job we'll have + +frames_per_group_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; } /^chunks_per_group/ { print int(fpc * $2); }') +if ! [ $frames_per_group_avg -gt 0 ]; then + echo "$0: error getting frames per group."; +fi + +num_groups=$(wc -l <$processed_egs_dir/train.scp) + +num_scp_files=$[[ (frames_per_group_avg + frames_per_job / 2) / frames_per_job ]] +[ $num_scp_files -eq 0 ] && num_scp_files=1 + +frames_per_scp_file=$[[(frames_per_group_avg * num_groups) / num_scp_files]] +groups_per_scp_file=$[[ num_groups / num_scp_files]] + + +mkdir -p $dir/temp + +if [ -d $dir/misc ]; then + rm -r $dir/misc +fi + +mkdir -p $dir/misc +cp $processed_egs_dir/misc/* $dir/misc + + +# We want to globally randomize the order of these blocks of (e.g.) 128 lines of +# the input train.scp, and then split up into $num_scp_files groups. we could +# do this in a specially-written python script, but instead we do it with a +# combination of existing Kaldi and UNIX utilities. + +awk '{block=sprintf("%05d", NR / groups_per_block); group_id=$1; print group_id, block;}' \ + <$processed_egs_dir/train.scp >$dir/temp/key2block + +# get list of blocks +awk '{print $2}' | uniq <$dir/temp/key2block > $dir/temp/blocks +# get randomized-order list of blocks +utils/shuffle_list.pl --srand "$srand" <$dir/temp/blocks > $dir/temp/blocks_rand +# Map block-ids to randomized-order block-ids +paste $dir/temp/blocks $dir/temp/blocks_rand > $dir/temp/block2rand + + +# The following command first maps block-ids to randomized-order block-ids, then +# sorts the keys by these randomized-order block-ids while otherwise maintaining +# stable sorting (-s) which keeps the keys in the blocks in the same order. +utils/apply_map.pl -f 2 $dir/temp/block2rand <$dir/temp/key2block | \ + sort -k2 -s > $dir/temp/key2block_rand + + +# The following command just changes the order of train.scp to +# match the order in key2block_rand (which has the order of blocks +# of lines randomly moved around). +awk '{print $1, $1}' $dir/temp/key2block_rand | \ + utils/apply_map.pl $processed_egs_dir/train.scp \ + >$dir/temp/train.scp_rand + + +# The following command splits up $dir/temp/train.scp_rand (the randomized-order +# version of train.scp), while keeping distinct blocks in separate scp files, +# thanks to the --utt2spk option. +utils/split_scp.pl --utt2spk=$dir/temp/key2block_rand \ + $dir/temp/train.scp_rand \ + $(for i in $(seq $num_scp_files); do echo $dir/train.$i.scp; done) + + +cp $processed_egs_dir/heldout_subset.scp $processed_egs_dir/train_subset.scp $dir/ + + + +cat $processed_egs_dir/info.txt | awk ' + /^dir_type/ { print "dir_type processed_chaina_egs"; next; } + /^lang / { print "langs", $2; next } + /^num_input_frames/ { print $2 * num_repeats; next; } # approximate; ignores held-out egs. 
+ {print;} + END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt + +cat <>$dir/info.txt +num_scp_files $num_scp_files +frames_per_scp_file $frames_per_scp_file +groups_per_scp_file $groups_per_scp_file +EOF + +# Note: frame_per_job, after rounding, becomes frames_per_scp_file. + + +if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then + echo "$0: we failed to obtain at least one of the fields in $dir/info.txt" + exit 1 +fi + + +echo "$0: Finished randomizing egs" diff --git a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh new file mode 100755 index 00000000000..d472484a035 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'raw' egs for 'chaina' training. +# It also helps to document the expectations on such a directory. + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/processed_egs" + echo "" + echo "Validates that the processed-egs dir has the expected format" +fi + +dir=$1 + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.scp $dir/info.txt \ + $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \ + $dir/train.1.scp $dir/train.1.ark; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "processed_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be processed_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + +echo "$0: sucessefully validated raw egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh new file mode 100755 index 00000000000..e2a29f96b55 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'raw' egs for 'chaina' training. +# It also helps to document the expectations on such a directory. + + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/raw_egs" + echo "" + echo "Validates that the raw-egs dir has the expected format" +fi + +dir=$1 + +for f in $dir/all.scp $dir/cegs.1.ark $dir/info.txt \ + $dir/misc/utt2spk; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "raw_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be raw_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." 
+ exit 1 + fi +done + +echo "$0: sucessefully validated raw egs in $dir" diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index e5aa13b848b..9196feb5d15 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -33,9 +33,9 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { supervision.Write(os, binary); WriteToken(os, binary, ""); deriv_weights.Write(os, binary); - if (chunks_per_spk != 1) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, chunks_per_spk); + if (chunks_per_group != 1) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, chunks_per_group); } WriteToken(os, binary, ""); } @@ -44,7 +44,7 @@ bool NnetChainSupervision::operator == (const NnetChainSupervision &other) const return name == other.name && indexes == other.indexes && supervision == other.supervision && deriv_weights.ApproxEqual(other.deriv_weights) && - chunks_per_spk == other.chunks_per_spk; + chunks_per_group == other.chunks_per_group; } void NnetChainSupervision::Read(std::istream &is, bool binary) { @@ -57,10 +57,10 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); deriv_weights.Read(is, binary); if (PeekToken(is, binary) == 'C') { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &chunks_per_spk); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &chunks_per_group); } else { - chunks_per_spk = 1; + chunks_per_group = 1; } ExpectToken(is, binary, ""); CheckDim(); @@ -80,8 +80,8 @@ void NnetChainSupervision::CheckDim() const { frame_skip = indexes[supervision.num_sequences].t - first_frame, num_sequences = supervision.num_sequences, frames_per_sequence = supervision.frames_per_sequence; - KALDI_ASSERT(chunks_per_spk > 0 && - num_sequences % chunks_per_spk == 0); + KALDI_ASSERT(chunks_per_group > 0 && + num_sequences % chunks_per_group == 0); int32 k = 0; for (int32 i = 0; i < frames_per_sequence; i++) { for (int32 j = 0; j < num_sequences; j++,k++) { @@ -101,14 +101,14 @@ NnetChainSupervision::NnetChainSupervision(const NnetChainSupervision &other): indexes(other.indexes), supervision(other.supervision), deriv_weights(other.deriv_weights), - chunks_per_spk(other.chunks_per_spk) { CheckDim(); } + chunks_per_group(other.chunks_per_group) { CheckDim(); } void NnetChainSupervision::Swap(NnetChainSupervision *other) { name.swap(other->name); indexes.swap(other->indexes); supervision.Swap(&(other->supervision)); deriv_weights.Swap(&(other->deriv_weights)); - std::swap(chunks_per_spk, other->chunks_per_spk); + std::swap(chunks_per_group, other->chunks_per_group); if (RandInt(0, 5) == 0) CheckDim(); } @@ -122,7 +122,7 @@ NnetChainSupervision::NnetChainSupervision( name(name), supervision(supervision), deriv_weights(deriv_weights), - chunks_per_spk(1) { + chunks_per_group(1) { // note: this will set the 'x' index to zero. 
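  // (For orientation: the sequence index n varies fastest here, e.g. with
  //  num_sequences = 2 and frames_per_sequence = 3 the (n, t) pairs are
  //  (0,t0), (1,t0), (0,t0+s), (1,t0+s), (0,t0+2s), (1,t0+2s), where t0 is
  //  the first output frame and s the frame skip; this is the layout that
  //  CheckDim() above assumes when it computes frame_skip.)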
indexes.resize(supervision.num_sequences * supervision.frames_per_sequence); @@ -268,7 +268,7 @@ static void MergeSupervision( } } } - output->chunks_per_spk = example_stride; + output->chunks_per_group = example_stride; output->CheckDim(); } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index f96dc81369f..eb6846fa4d2 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -81,10 +81,11 @@ struct NnetChainSupervision { /// This will be 1 in normal cases, but in the 'chaina' code (chain training /// with adaptation) it will be set to the number of chunks/sequences per - /// speaker in this minibatch. For example if it's 4, then we are asserting - /// that sequences n=0 through 3 all come from the same speaker, n=4 through 7 - /// all come from the same speaker, and so on. - int32 chunks_per_spk; + /// group in this minibatch (the chunks from a particular group are expected + /// to come from the same speaker). For example if it's 4, then we are + /// asserting that sequences n=0 through 3 all come from the same speaker, n=4 + /// through 7 all come from the same speaker, and so on. + int32 chunks_per_group; // Use default assignment operator diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 3c13043b40c..c91ac3863d4 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -543,7 +543,7 @@ bool NnetChainaTopTrainer::TrainAdapted( bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, int32 num_sequences, - int32 num_spk, + int32 num_groups, int32 first_input_t, int32 top_subsampling_factor, const VectorBase &deriv_weights_in, @@ -604,7 +604,7 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, using namespace differentiable_transform; MinibatchInfoItf *minibatch_info = transform_.transform->TrainingForward( - input, num_sequences, num_spk, post_padded, &adapted_input); + input, num_sequences, num_groups, post_padded, &adapted_input); success = TrainAdapted( *computation_adapted, supervision, @@ -619,7 +619,7 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, delete minibatch_info; else transform_.transform->TrainingBackward(input, adapted_input_deriv, - num_sequences, num_spk, post_padded, + num_sequences, num_groups, post_padded, minibatch_info, input_deriv); return true; } @@ -977,17 +977,17 @@ void NnetChainaTrainer::Train(const std::string &key, if (opts_.top_model_test_mode) top_weight = 0.0; - int32 num_sequences, chunks_per_spk, first_input_t, + int32 num_sequences, chunks_per_group, first_input_t, num_input_frames, num_output_frames, frame_subsampling_factor, eg_left_context, eg_right_context; - FindChainaExampleStructure(eg, &num_sequences, &chunks_per_spk, + FindChainaExampleStructure(eg, &num_sequences, &chunks_per_group, &first_input_t, &num_input_frames, &num_output_frames, &frame_subsampling_factor, &eg_left_context, &eg_right_context); - KALDI_ASSERT(chunks_per_spk % num_sequences == 0); - int32 num_spk = num_sequences / chunks_per_spk; + KALDI_ASSERT(chunks_per_group % num_sequences == 0); + int32 num_groups = num_sequences / chunks_per_group; AmNnetSimple *top_am_nnet = models_->GetNnetForLang(lang_name); int32 top_left_context = top_am_nnet->LeftContext(), @@ -1029,7 +1029,7 @@ void NnetChainaTrainer::Train(const std::string &key, bool success = top_trainer->Train(cu_embedding, num_sequences, - num_spk, + num_groups, first_embedding_t_subsampled, top_subsampling_factor, eg.outputs[0].deriv_weights, diff --git 
a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index 0c2a7e2073c..60579eeafff 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -311,9 +311,10 @@ class NnetChainaTopTrainer { in 'input' (a.k.a. the minibatch size). Actually this must be equal to supervision.num_sequences, but it's easier for reasons of clarity and documentation to repeat it here. - @param [in] num_spk The total number of speakers. Must be >1, and must divide + @param [in] num_groups The total number of groups of chunks (you + can think of these as the same as speakers). Must be >1, and must divide num_sequences. The number of sequences per speaker - must be the same for all speakers (it will equal num_sequences / num_spk), + must be the same for all speakers (it will equal num_sequences / num_groups), and the sequences for a speaker must be consecutively numbered. @param [in] first_input_t The 't' value corresponding to the first input frame (will normally be a negative number, @@ -353,7 +354,7 @@ class NnetChainaTopTrainer { */ bool Train(const CuMatrixBase &input, int32 num_sequences, - int32 num_spk, + int32 num_groups, int32 first_input_t, int32 top_subsampling_factor, const VectorBase &deriv_weights, @@ -851,7 +852,7 @@ class NnetChainaTrainer { @param [in] eg The example we are training on. It is expected to have an input named 'input' (the features) and an output named 'output' (containing the chain supervision - object). We'll make use of the chunks_per_spk member + object). We'll make use of the chunks_per_group member of the NnetChainSupervision object, which is not used outside the 'chaina' framework. */ From 08fdf02c76974c70a86ffb4dedd2a9901831caf2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 5 Jan 2019 23:04:12 -0800 Subject: [PATCH 57/87] [scripts] Further progress on validation scripts --- egs/wsj/s5/steps/chaina/randomize_egs.sh | 17 ++--- .../s5/steps/chaina/validate_processed_egs.sh | 7 +- .../steps/chaina/validate_randomized_egs.sh | 65 +++++++++++++++++++ egs/wsj/s5/steps/chaina/validate_raw_egs.sh | 4 +- 4 files changed, 76 insertions(+), 17 deletions(-) create mode 100755 egs/wsj/s5/steps/chaina/validate_randomized_egs.sh diff --git a/egs/wsj/s5/steps/chaina/randomize_egs.sh b/egs/wsj/s5/steps/chaina/randomize_egs.sh index 37e1aaaa372..878c3d31001 100755 --- a/egs/wsj/s5/steps/chaina/randomize_egs.sh +++ b/egs/wsj/s5/steps/chaina/randomize_egs.sh @@ -79,18 +79,11 @@ dir=$2 # die on error or undefined variable. set -e -u -for f in train.scp heldout_subset.scp train_subset.scp info.txt; do - if [ ! -f $processed_egs_dir/$f ]; then - echo "$0: expected file $processed_egs_dir/$f to exist." - exit 1 - fi -done - -if ! awk '/dir_type /{if ($2 != "processed_chaina_dir") exit(1); }' <$processed_egs_dir/info.txt; then - echo "$0: input directory $processed_egs_dir does not seem to be of the right type." +if ! steps/chaina/validate_processed_egs.sh $processed_egs_dir; then + echo "$0: could not validate input directory $processed_egs_dir" + exit 1 fi - # Work out how many groups per job and how many frames per job we'll have frames_per_group_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; } /^chunks_per_group/ { print int(fpc * $2); }') @@ -144,7 +137,7 @@ utils/apply_map.pl -f 2 $dir/temp/block2rand <$dir/temp/key2block | \ # match the order in key2block_rand (which has the order of blocks # of lines randomly moved around). 
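# (A small worked example: if key2block puts keys k1,k2 in one block, k3,k4 in
# a second and k5,k6 in a third, and the shuffled block order comes out as
# third, first, second, then train.scp_rand will contain the lines for
# k5,k6,k1,k2,k3,k4: whole blocks move around, but the order of the keys
# within each block is preserved.)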
awk '{print $1, $1}' $dir/temp/key2block_rand | \ - utils/apply_map.pl $processed_egs_dir/train.scp \ + utils/apply_map.pl -f 2 $processed_egs_dir/train.scp \ >$dir/temp/train.scp_rand @@ -161,7 +154,7 @@ cp $processed_egs_dir/heldout_subset.scp $processed_egs_dir/train_subset.scp $di cat $processed_egs_dir/info.txt | awk ' - /^dir_type/ { print "dir_type processed_chaina_egs"; next; } + /^dir_type/ { print "dir_type randomized_chaina_egs"; next; } /^lang / { print "langs", $2; next } /^num_input_frames/ { print $2 * num_repeats; next; } # approximate; ignores held-out egs. {print;} diff --git a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh index d472484a035..c25f4a89a01 100755 --- a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh +++ b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh @@ -2,8 +2,9 @@ # Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. # -# This script validates a directory containing 'raw' egs for 'chaina' training. -# It also helps to document the expectations on such a directory. +# This script validates a directory containing 'processed' egs for 'chaina' +# training, i.e. the output of process_egs.sh. It also helps to document the +# expectations on such a directory. if [ -f path.sh ]; then . ./path.sh; fi @@ -45,4 +46,4 @@ for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do fi done -echo "$0: sucessefully validated raw egs in $dir" +echo "$0: sucessefully validated processed egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh new file mode 100755 index 00000000000..32a97069f7d --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'randomized' egs for 'chaina' +# training, i.e. the output of randomize_egs.sh (this is the final form of the +# egs which is consumed by the training script). It also helps to document the +# expectations on such a directory. + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs" + echo "" + echo "Validates that the final (ranodmized) egs dir has the expected format" +fi + +dir=$1 + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.1.scp $dir/info.txt \ + $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "ranodmized_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be randomized_chaina_egs in $dir/info.txt" + exit 1 +fi + +langs=$(awk '/^langs / {$1 = ""; print; }' <$dir/info.txt) +num_scp_files=$(awk '/^num_scp_files / { print $2; }' <$dir/info.txt) + +if [ -z "$langs" ]; then + echo "$0: expecting the list of languages to be nonempty in $dir/info.txt" + exit 1 +fi + +for lang in $langs; do + for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi + done +done + +for i in $(seq $num_scp_files); do + if ! 
[ -s $dir/train.$i.scp ]; then + echo "$0: expected file $dir/train.$i.scp to exist and be nonempty." + exit 1 + fi +done + + +echo "$0: sucessefully validated randomized egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh index e2a29f96b55..c06920d58c5 100755 --- a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh @@ -21,7 +21,7 @@ dir=$1 for f in $dir/all.scp $dir/cegs.1.ark $dir/info.txt \ $dir/misc/utt2spk; do - if ! [ -f $f -a -s $f ]; then + if ! [ -s $f ]; then echo "$0: expected file $f to exist and be nonempty." exit 1 fi @@ -37,7 +37,7 @@ fi lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do - if ! [ -f $f -a -s $f ]; then + if ! [ -s $f ]; then echo "$0: expected file $f to exist and be nonempty." exit 1 fi From b337595618012932cfd54b919e57412a6c9ed6b5 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 6 Jan 2019 11:56:13 -0500 Subject: [PATCH 58/87] Implement choose_egs_to_merge.py --- .../chaina/internal/choose_egs_to_merge.py | 349 ++++++++++++++---- 1 file changed, 278 insertions(+), 71 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index 60a56a5bca9..536d38adf95 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -1,87 +1,108 @@ #!/usr/bin/env python3 # Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# Copyright 2018 Hossein Hadian + # License: Apache 2.0. import os import argparse import sys import re +import logging +import traceback +import random + +sys.path.insert(0, 'steps') + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Start generating multilingual examples') + + +def get_args(): + parser = argparse.ArgumentParser(description="Chooses groups of examples to merge into groups " + "of size given by the --chunks-per-spk option, based on speaker " + "information (preferentially, chunks from the same utterance " + "and, if possible, the same speaker, get combined into " + "groups). This script also computes a held-out subset of...", + epilog="E.g. " + sys.argv[0] + "*** TODO *** ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + # Also maybe have --num-repeats, which must divide --chunks-per-spk? Can be + # used to divide data into different groups than the default ones. -parser = argparse.ArgumentParser(description="Chooses groups of examples to merge into groups " - "of size given by the --chunks-per-spk option, based on speaker " - "information (preferentially, chunks from the same utterance " - "and, if possible, the same speaker, get combined into " - "groups). This script also computes a held-out subset of...", - epilog="E.g. " + sys.argv[0] + "*** TODO *** ", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - -# Also maybe have --num-repeats, which must divide --chunks-per-spk? Can be -# used to divide data into different groups than the default ones. 
- - -parser.add_argument("--chunks-per-spk", type=int, default=4, - help="Number of chunks per speaker in the final egs (actually " - "means the number of chunks per group of chunks, and they are " - "only preferentially taken from the same speaker.") -parser.add_argument("--num-repeats", type=int, default=1, - help="The number of times the data is to be repeated. Must divide " - "--chunks-per-spk. Suggest to try only 1 or 2. The idea " - "is to divide chunks into groups in different ways, to give " - "more variety to the egs (since the adaptation information " - "will differ.") -parser.add_argument("--heldout-data-selection-proportion", type=float, - default=0.2, - help="This parameter governs the selection of the heldout " - "subset and the statistically matched training subset. " - "It does not affect the size of that subset, but only " - "affects what pool the examples are drawb from. " - "Smaller values of this mean that the heldout groups " - "will be preferentially drawn from groups that " - "'contaminate' the least number of other groups, " - "and so require the least data to be removed from the " - "training set. Setting this to 1.0 would mean that " - "the heldout subset is drawn completely at random " - "(which might be more wasteful of training data, but " - "gives a selection that's statistically more " - "representative).") -parser.add_argument("--num-heldout-groups", type=int, default=200, - help="Number of utterance groups " - "that will go in the heldout subset (and in the " - "statistically matched training subset)") -parser.add_argument("--utt2uniq", type=str, default='', - help="File used in setups with data " - "augmentation, that maps from utterance-ids to the " - "pre-augmentation utterance-id. The reason it's needed " - "is to ensure that the heldout set is properly held " - "out (i.e., that different versions of those utterances " - "weren't trained on. If not specified, we assume the " - "identity map.") -parser.add_argument("--scp-in", type=str, required=True, - help="The scp file in, likely containing chain egs. The " - "keys are expected to be of the form: " - "'-----v1', " - "where the left_context, num_frames and right_context are required to be the " - "same in order for keys to be in a group (note: it's best if the " - "--extra-left-context-initial and --extra-right-context-final options " - "are not used, and if the --frames-per-chunk is a single number, in " - "order to prevent this constraint from splitting up the utterances from " - "a single speaker") -parser.add_argument("--training-data-out", type=str, required=True, - help="The output file containing the chunks that are to be grouped; each " - "line will contain --chunks-per-spk (e.g. 
4) rxfilenames, obtained " - "from the second field of the input --scp-in file.") -parser.add_argument("--heldout-subset-out", type=str, required=True, - help="This is the name of the file to which the heldout data subset " - "will be written; the format is the same as --training-data-out.") -parser.add_argument("--training-subset-out", type=str, required=True, - help="This is the name of the file to which the statistically matched " - "(to --heldout-subset-out) set of training data will be written") - -args = parser.parse_args() + + parser.add_argument("--chunks-per-spk", type=int, default=4, + help="Number of chunks per speaker in the final egs (actually " + "means the number of chunks per group of chunks, and they are " + "only preferentially taken from the same speaker.") + parser.add_argument("--num-repeats", type=int, default=1, + help="The number of times the data is to be repeated. Must divide " + "--chunks-per-spk. Suggest to try only 1 or 2. The idea " + "is to divide chunks into groups in different ways, to give " + "more variety to the egs (since the adaptation information " + "will differ.") + parser.add_argument("--heldout-data-selection-proportion", type=float, + default=0.2, + help="This parameter governs the selection of the heldout " + "subset and the statistically matched training subset. " + "It does not affect the size of that subset, but only " + "affects what pool the examples are drawb from. " + "Smaller values of this mean that the heldout groups " + "will be preferentially drawn from groups that " + "'contaminate' the least number of other groups, " + "and so require the least data to be removed from the " + "training set. Setting this to 1.0 would mean that " + "the heldout subset is drawn completely at random " + "(which might be more wasteful of training data, but " + "gives a selection that's statistically more " + "representative).") + parser.add_argument("--num-heldout-groups", type=int, default=200, + help="Number of utterance groups " + "that will go in the heldout subset (and in the " + "statistically matched training subset)") + parser.add_argument("--utt2uniq", type=str, default='', + help="File used in setups with data " + "augmentation, that maps from utterance-ids to the " + "pre-augmentation utterance-id. The reason it's needed " + "is to ensure that the heldout set is properly held " + "out (i.e., that different versions of those utterances " + "weren't trained on. If not specified, we assume the " + "identity map.") + parser.add_argument("--scp-in", type=str, required=True, + help="The scp file in, likely containing chain egs. The " + "keys are expected to be of the form: " + "'-----v1', " + "where the left_context, num_frames and right_context are required to be the " + "same in order for keys to be in a group (note: it's best if the " + "--extra-left-context-initial and --extra-right-context-final options " + "are not used, and if the --frames-per-chunk is a single number, in " + "order to prevent this constraint from splitting up the utterances from " + "a single speaker") + parser.add_argument("--training-data-out", type=str, required=True, + help="The output file containing the chunks that are to be grouped; each " + "line will contain --chunks-per-spk (e.g. 
4) rxfilenames, obtained " + "from the second field of the input --scp-in file.") + parser.add_argument("--heldout-subset-out", type=str, required=True, + help="This is the name of the file to which the heldout data subset " + "will be written; the format is the same as --training-data-out.") + parser.add_argument("--training-subset-out", type=str, required=True, + help="This is the name of the file to which the statistically matched " + "(to --heldout-subset-out) set of training data will be written") + + print(sys.argv, file=sys.stderr) + args = parser.parse_args() + + return args # TODO: please print the command line to stderr for logging purposes. @@ -156,3 +177,189 @@ dying). """ + +class Chunk: + """ This is a data structure for a chunk. A chunk is a single entry + of the --scp-in file. + """ + def __init__(self, scp_line): + result = re.match("^(.*)-(\d+)-(\d+)-(\d+)-(\d+)-v1\s+(.*)$", scp_line) + self.utt_id, first_frame, left_context, num_frames, right_context, self.eg = result.groups() + self.chunk_id = self.utt_id + '-' + first_frame + self.context_structure = '-'.join((left_context, num_frames, right_context)) + def __repr__(self): + return '{}-{} {}'.format(self.chunk_id, self.context_structure, self.eg) + + +def read_all_chunks(scp_file): + """ Loads all the lines of the --scp-in file as chunk objects. + """ + chunks = [] + with open(scp_file, 'r', encoding='latin-1') as f: + for line in f: + try: + chunks.append(Chunk(line.strip())) + except: + logger.error('Bad line: ' + line.strip()) + raise + return chunks + +def load_utt2uniq(filename): + """ Loads the --utt2uniq file as a dict. + """ + utt2uniq = {} + with open(filename, 'r', encoding='latin-1') as f: + for line in f: + uttid, base_uttid = line.strip().split() + utt2uniq[uttid] = base_uttid + return utt2uniq + +def write_egs(filename, group_indexes, all_groups): + """ Writes the output data of this program, i.e. the second field of + the --scp-in file for specific chunks specified by `group_indexes`. + """ + with open(filename, 'w', encoding='latin-1') as f: + for group_index in group_indexes: + for chunk in all_groups[group_index]: + f.write('{}\n'.format(chunk.eg)) + + + +def choose_egs(args): + """ The main part of the program. 
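+    Roughly, the steps below are: read all chunks from --scp-in; split
+    them into sub-lists sharing the same context structure
+    (left-context/num-frames/right-context); within each sub-list form
+    consecutive groups of --chunks-per-spk chunks (plus, if
+    --num-repeats=2, a second set of groups offset by 2 chunks); use the
+    utt2uniq map to work out which groups share an underlying utterance;
+    pick the heldout groups from among those that overlap the fewest
+    other groups and remove them (and everything they overlap) from the
+    training set; pick a same-sized, statistically comparable training
+    subset from the remaining low-overlap candidates; then write the
+    three output lists.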
+ """ + + all_chunks = read_all_chunks(args.scp_in) + logger.info('Loaded {} chunks.'.format(len(all_chunks))) + + chunk_to_sublist = {} + for chunk in all_chunks: + if chunk.context_structure not in chunk_to_sublist: + chunk_to_sublist[chunk.context_structure] = [chunk] + else: + chunk_to_sublist[chunk.context_structure].append(chunk) + + logger.info('Created {} sub-lists with uniqe context ' + 'structure.'.format(len(chunk_to_sublist))) + + + assert(args.num_repeats == 1 or args.num_repeats == 2) + groups = [] + for sublist in chunk_to_sublist.values(): + logger.info('Processing chunks with context ' + 'structure: {}'.format(sublist[0].context_structure)) + num_groups = (len(sublist) + + args.chunks_per_spk - 1) // args.chunks_per_spk + for i in range(num_groups): + group = sublist[i*args.chunks_per_spk : (i+1)*args.chunks_per_spk] + groups.append(group) + if args.num_repeats == 2: + group = sublist[i * args.chunks_per_spk + 2 : + (i + 1) * args.chunks_per_spk + 2] + if group: + groups.append(group) + + logger.info('Created {} groups.'.format(len(groups))) + #for i in range(len(groups)): + # print('Group {}: {}'.format(i, str(groups[i]))) + utt2uniq = {} + if args.utt2uniq: + utt2uniq = load_utt2uniq(args.utt2uniq) + logger.info('Loaded utt2uniq file with {} entries.'.format(len(utt2uniq))) + else: + logger.info('--utt2uniq not specified; using identity map.') + + + uniq_to_groups = {} # uniq to set of groups that include it + for i, group in enumerate(groups): + for chunk in group: + uniq = utt2uniq.get(chunk.utt_id, chunk.utt_id) + if uniq not in uniq_to_groups: + uniq_to_groups[uniq] = set([i]) + else: + uniq_to_groups[uniq].add(i) + + logger.info('Computed uniq-to-groups for {} uniqs. Average number of ' + 'groups representing a uniq is ' + '{}'.format(len(uniq_to_groups), + sum([len(g) for g in uniq_to_groups.values()]) / + len(uniq_to_groups))) + # This is indexed by group-index (same len as groups). other_groups[i] is + # the set of other groups which share some utterance with group i. + other_groups = [set() for g in groups] + for i, group in enumerate(groups): + for chunk in group: + uniq = utt2uniq.get(chunk.utt_id, chunk.utt_id) + other_groups_this_uniq = uniq_to_groups[uniq] + other_groups[i].update(other_groups_this_uniq) + + for i, other in enumerate(other_groups): + other.remove(i) + + # 'group_shared_size' is a list of pairs (i, n) where i is group-index and + # n is the number of groups that we'd + # have to hold out if we were to put that group in the heldout set. 
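+    # (Tiny example: if other_groups were [{1}, {0}, set()], group_shared_size
+    #  would be [(0, 1), (1, 1), (2, 0)]; after the sort on n below it becomes
+    #  [(2, 0), (0, 1), (1, 1)], i.e. group 2, which overlaps no other group,
+    #  is the cheapest candidate for the heldout set.)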
+ group_shared_size = [(i, len(other)) for i, other in enumerate(other_groups)] + # Sort it on n: + group_shared_size.sort(key=lambda tup: tup[1]) + + total_num_groups = len(groups) + training_set = set(range(total_num_groups)) # All groups + candidate_set_size = int(args.heldout_data_selection_proportion + * total_num_groups) + logger.info('Initial candidate set size: {}'.format(candidate_set_size)) + if args.num_heldout_groups > candidate_set_size: + logger.error('args.heldout_data_selection_proportion is too small or ' + 'there are too few groups.') + sys.exit(1) + + candidate_set = set([tup[0] for tup in group_shared_size[:candidate_set_size]]) + heldout_list = random.sample(candidate_set, args.num_heldout_groups) + + + # Remove all the heldout groups (and any other groups sharing some utterance + # with them) from both the candidate set and the training set + for group_index in heldout_list: + for shared_group_index in other_groups[group_index]: + candidate_set.discard(shared_group_index) + training_set.discard(shared_group_index) + candidate_set.discard(group_index) + training_set.discard(group_index) + + logger.info('Candidate set size after removing heldout ' + 'groups: {}'.format(len(candidate_set))) + if args.num_heldout_groups > len(candidate_set): + logger.warn('Not enough groups left in the candidate set. Doubling it.') + candidate_set = set([tup[0] for tup in + group_shared_size[:candidate_set_size * 2]]) + for group_index in heldout_list: + for shared_group_index in other_groups[group_index]: + candidate_set.discard(shared_group_index) + candidate_set.discard(group_index) + logger.info('Candidate set size after doubling and removing heldout ' + 'groups: {}'.format(len(candidate_set))) + if args.num_heldout_groups > len(candidate_set): + logger.error('args.heldout_data_selection_proportion is too small ' + 'or there are too few groups. 
Not enough groups left.') + sys.exit(1) + + train_subset_list = random.sample(candidate_set, args.num_heldout_groups) + + + # Write the outputs: + write_egs(args.training_data_out, training_set, groups) + write_egs(args.heldout_subset_out, heldout_list, groups) + write_egs(args.training_subset_out, train_subset_list, groups) + + +def main(): + try: + args = get_args() + choose_egs(args) + except Exception as e: + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() From c0eab12fa14dfedb94d3056187d8965b63bcfdb0 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 6 Jan 2019 12:03:43 -0500 Subject: [PATCH 59/87] Remove comments --- egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index fc0ab74be4a..56920a64662 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -261,8 +261,7 @@ def choose_egs(args): groups.append(group) logger.info('Created {} groups.'.format(len(groups))) - #for i in range(len(groups)): - # print('Group {}: {}'.format(i, str(groups[i]))) + utt2uniq = {} if args.utt2uniq: utt2uniq = load_utt2uniq(args.utt2uniq) From 8e894b14de6bd667c522a4bde47a3e557eec3fc2 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 6 Jan 2019 12:12:32 -0500 Subject: [PATCH 60/87] Some cleanup --- .../chaina/internal/choose_egs_to_merge.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index 56920a64662..52582fdcde4 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -23,7 +23,7 @@ "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) -logger.info('Start generating multilingual examples') +logger.info('Starting choose_egs_to_merge.py') @@ -37,10 +37,6 @@ def get_args(): epilog="E.g. " + sys.argv[0] + "*** TODO *** ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - # Also maybe have --num-repeats, which must divide --chunks-per-group? Can be - # used to divide data into different groups than the default ones. - - parser.add_argument("--chunks-per-group", type=int, default=4, help="Number of chunks per speaker in the final egs (actually " "means the number of chunks per group of chunks, and they are " @@ -105,9 +101,6 @@ def get_args(): return args -# TODO: please print the command line to stderr for logging purposes. -# Any useful debugging messages can go to stderr too. - """ Notes on plan for how to implement this (we can keep this as documentation, but we'll maybe move some of it around when things get implemented). @@ -181,6 +174,7 @@ def get_args(): class Chunk: """ This is a data structure for a chunk. A chunk is a single entry of the --scp-in file. + 'eg' second field of --scp-in file """ def __init__(self, scp_line): result = re.match("^(.*)-(\d+)-(\d+)-(\d+)-(\d+)-v1\s+(.*)$", scp_line) @@ -215,7 +209,7 @@ def load_utt2uniq(filename): return utt2uniq def write_egs(filename, group_indexes, all_groups): - """ Writes the output data of this program, i.e. the second field of + """ Writes the output egs, i.e. the second field of the --scp-in file for specific chunks specified by `group_indexes`. 
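    (Each line written is the rxfilename of a single chunk's eg, e.g.
    something like raw_egs/cegs.1.ark:24871.)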
""" with open(filename, 'w', encoding='latin-1') as f: @@ -244,7 +238,7 @@ def choose_egs(args): assert(args.num_repeats == 1 or args.num_repeats == 2) - groups = [] + groups = [] # All groups from all sub-lists for sublist in chunk_to_sublist.values(): logger.info('Processing chunks with context ' 'structure: {}'.format(sublist[0].context_structure)) @@ -260,7 +254,7 @@ def choose_egs(args): if group: groups.append(group) - logger.info('Created {} groups.'.format(len(groups))) + logger.info('Created a total of {} groups.'.format(len(groups))) utt2uniq = {} if args.utt2uniq: @@ -284,6 +278,7 @@ def choose_egs(args): '{}'.format(len(uniq_to_groups), sum([len(g) for g in uniq_to_groups.values()]) / len(uniq_to_groups))) + # This is indexed by group-index (same len as groups). other_groups[i] is # the set of other groups which share some utterance with group i. other_groups = [set() for g in groups] @@ -293,7 +288,7 @@ def choose_egs(args): other_groups_this_uniq = uniq_to_groups[uniq] other_groups[i].update(other_groups_this_uniq) - for i, other in enumerate(other_groups): + for i, other in enumerate(other_groups): # Remove self other.remove(i) # 'group_shared_size' is a list of pairs (i, n) where i is group-index and From 00ef41eb378bbc11adaeed053ff364ab1e2bfa13 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 6 Jan 2019 12:07:53 -0800 Subject: [PATCH 61/87] [scripts,egs] Small fixes/progress --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 28 +++++++++++-------- egs/wsj/s5/steps/chaina/randomize_egs.sh | 2 +- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 33440cd9495..8d1f1f4c635 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -283,6 +283,15 @@ egs_left_context=$[[model_left_context+egs_extra_left_context]] egs_right_context=$[[model_right_context+egs_extra_right_context]] +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $d/storage ]; then + for d in $dir/raw_egs $dir/processed_egs; do + mkdir -p $d + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$d/storage $d/storage + done +fi + + if [ $stage -le 18 ]; then echo "$0: about to dump raw egs." # Dump raw egs. @@ -302,20 +311,17 @@ if [ $stage -le 19 ]; then --chunks-per-group ${chunks_per_group} ${dir}/raw_egs ${dir}/processed_egs fi +if [ $stage -le 20 ]; then + echo "$0: about to randomize egs" + steps/chaina/randomize_egs.sh --frames-per-job 3000000 \ + ${dir}/processed_egs ${dir}/egs +fi + - for d in $dir/raw_egs $dir/merged_egs; do - mkdir -p $d - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $d/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$d/storage $d/storage - fi - done - mkdir -p $dir/raw_egs - steps/chaina/get_raw_egs.sh --lang default \ - ${train_data_dir} $dir exp/tri3_lats $dir/raw_egs -fi +exit 0; + # Work out the model # The following script is equivalent to doing something like the diff --git a/egs/wsj/s5/steps/chaina/randomize_egs.sh b/egs/wsj/s5/steps/chaina/randomize_egs.sh index 878c3d31001..6c49f5112ab 100755 --- a/egs/wsj/s5/steps/chaina/randomize_egs.sh +++ b/egs/wsj/s5/steps/chaina/randomize_egs.sh @@ -49,7 +49,7 @@ if [ -f path.sh ]; then . 
./path.sh; fi if [ $# != 2 ]; then echo "Usage: $0 [opts] " - echo " e.g.: $0 --frames-per-job 200000 exp/chaina/tdnn1a_sp/processed_egs exp/chaina/tdnn1a_sp/egs" + echo " e.g.: $0 --frames-per-job 2000000 exp/chaina/tdnn1a_sp/processed_egs exp/chaina/tdnn1a_sp/egs" echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options (alternative to this" From 0f3adb8faeab89cabdbeaf806151a2c9aec44c07 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 6 Jan 2019 16:05:47 -0500 Subject: [PATCH 62/87] Some bugfixes --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 8 ++++---- src/adapt/differentiable-fmllr.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 8d1f1f4c635..0e22fc0ae66 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -60,7 +60,7 @@ fi # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 11" if you have already # run those things. -local/chaina/run_data_prep.sh.sh --stage $stage \ +local/chaina/data_prep_common.sh --stage $stage \ --train-set $train_set \ --gmm $gmm || exit 1; @@ -75,7 +75,7 @@ dir=exp/chaina/tdnn${affix}_sp train_data_dir=data/${train_set}_sp_hires2 lores_train_data_dir=data/${train_set}_sp -for f in $gmm_dir/final.mdl $train_data_dir/feats.sc \ +for f in $gmm_dir/final.mdl $train_data_dir/feats.scp \ $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done @@ -156,14 +156,14 @@ if [ $stage -le 13 ]; then batchnorm-component name=input-batchnorm - relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Splice(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 # this 'batchnorm-layer' has an affine component but no nonlinearlity linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 - batchnorm-component name=linear_bottleneck_bn dim=256 + batchnorm-component name=linear_bottleneck_bn output name=output input=linear_bottleneck EOF steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h index d67519e57c2..db8637a3ded 100644 --- a/src/adapt/differentiable-fmllr.h +++ b/src/adapt/differentiable-fmllr.h @@ -28,6 +28,8 @@ #include "util/kaldi-holder.h" #include "hmm/posterior.h" #include "matrix/matrix-functions.h" +#include "matrix/matrix-common.h" +#include "matrix/sp-matrix.h" namespace kaldi { namespace differentiable_transform { From eeacc97cc7166ca45b7ee29e9ac107c8c608820a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 6 Jan 2019 13:34:32 -0800 Subject: [PATCH 63/87] [src] Add missing files --- src/nnet3abin/Makefile | 26 +++++ src/nnet3abin/nnet3-adapt.cc | 172 ++++++++++++++++++++++++++++ src/nnet3abin/nnet3-chaina-train.cc | 115 +++++++++++++++++++ 3 files changed, 313 insertions(+) create mode 100644 src/nnet3abin/Makefile create mode 100644 src/nnet3abin/nnet3-adapt.cc 
create mode 100644 src/nnet3abin/nnet3-chaina-train.cc diff --git a/src/nnet3abin/Makefile b/src/nnet3abin/Makefile new file mode 100644 index 00000000000..d763dcf9cc5 --- /dev/null +++ b/src/nnet3abin/Makefile @@ -0,0 +1,26 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = nnet3-adapt nnet3-chaina-train + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. +cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../nnet3a/kaldi-nnet3a.a ../adapt/kaldi-adapt.a ../nnet3/kaldi-nnet3.a \ + ../chain/kaldi-chain.a \ + ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ + ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc new file mode 100644 index 00000000000..547a18fb62d --- /dev/null +++ b/src/nnet3abin/nnet3-adapt.cc @@ -0,0 +1,172 @@ +// nnet3abin/nnet3-adapt.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-nnet.h" +#include "hmm/transition-model.h" +#include "adapt/differentiable-transform-itf.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::differentiable_transform; + typedef kaldi::int32 int32; + + const char *usage = + "Initialize nnet3 neural network from a config file; outputs 'raw' nnet\n" + "without associated information such as transition model and priors.\n" + "Search for examples in scripts in /egs/wsj/s5/steps/nnet3/\n" + "Can also be used to add layers to existing model (provide existing model\n" + "as 1st arg)\n" + "\n" + "Usage: nnet3-adapt [options] init [] \n" + " e.g.: nnet3-adapt --num-classes=201 init init.aconfig 0.ada\n" + " or: nnet3-adapt init init.aconfig tree.map 0.ada\n" + " or: nnet3-adapt [options] copy \n" + " e.g.: nnet3-adapt copy --binary=false 0.ada 0.txt\n" + " or: nnet3-adapt info \n" + " e.g.: nnet3-adapt info 0.ada\n" + " or: nnet3-adapt [options] adapt \n" + "\n" + "See also: nnet3-chaina-train\n"; + + bool binary_write = true; + bool remove_pdf_map = false; + int32 num_classes = -1; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("num-classes", &num_classes, + "For 'init' command: number of classes the transform will " + "use (required if is not supplied)."); + po.Register("remove-pdf-map", &remove_pdf_map, + "For the 'copy' command: if true, the pdf_map will be " + "removed so that the transform will be based on " + "pdf-ids."); + + po.Read(argc, argv); + + + if (po.GetArg(1) == "init" && po.NumArgs() == 3) { + // This block does the "init" command where the tree.map was not provided. + if (num_classes <= 0) + KALDI_ERR << "The --num-classes option is required with the " + "'init' command."; + std::string config_rxfilename = po.GetArg(2), + transform_wxfilename = po.GetArg(3); + bool binary_in; // should be false. + Input ki(config_rxfilename, &binary_in); + DifferentiableTransformMapped transform; + + transform.transform = DifferentiableTransform::ReadFromConfig( + ki.Stream(), num_classes); + + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetArg(1) == "init" && po.NumArgs() == 4) { + // This block does the "init" command where the tree.map was provided. + std::string config_rxfilename = po.GetArg(2), + tree_map_rxfilename = po.GetArg(3), + transform_wxfilename = po.GetArg(4); + + DifferentiableTransformMapped transform; + { // This block reads transform.pdf_map and sets up num_classes. + bool binary_in; + Input ki(tree_map_rxfilename, &binary_in); + ReadIntegerVector(ki.Stream(), binary_in, &(transform.pdf_map)); + if (transform.pdf_map.empty()) + KALDI_ERR << "Expected to be nonempty vector."; + int32 expected_num_classes = *std::max_element(transform.pdf_map.begin(), + transform.pdf_map.end()); + if (num_classes > 0 && num_classes != expected_num_classes) + KALDI_ERR << "The --num-classes given via the option " << num_classes + << " differs from the expected value given the tree-map: " + << expected_num_classes; + num_classes = expected_num_classes; + } + + bool binary_in; // should be false. 
+ Input ki(config_rxfilename, &binary_in); + transform.transform = DifferentiableTransform::ReadFromConfig( + ki.Stream(), num_classes); + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetArg(1) == "info" && po.NumArgs() == 2) { + std::string transform_rxfilename = po.GetArg(2); + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + std::cout << transform.Info(); + return 0; + } else if (po.GetArg(1) == "copy" && po.NumArgs() == 3) { + std::string transform_rxfilename = po.GetArg(2), + transform_wxfilename = po.GetArg(3); + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + if (remove_pdf_map) { + if (transform.pdf_map.empty()) { + KALDI_WARN << "--remove-pdf-map option: transform does not have a pdf-map."; + } else { + transform.transform->SetNumClasses(transform.pdf_map.size()); + transform.pdf_map.clear(); + } + } + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetArg(1) == "adapt" && po.NumArgs() == 5) { + KALDI_ERR << "The 'adapt' command has not been implemented yet."; + return 0; + } else { + po.PrintUsage(); + exit(1); + } + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + +/* +Test script: + +cat < \n" + " \n" + "\n" + " should contain bottom.raw, and .mdl for each language \n" + " should contain .den.fst for each language \n" + " should contain .ada for each language \n" + " is a place to where bottom.raw and .raw for each language\n" + " that was seen in the egs, will be written.\n"; + + + int32 srand_seed = 0; + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainaTrainingOptions chaina_opts; + int32 job_id = 0; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("job-id", &job_id, + "Job identifier, helps to determine pathnames of models written " + "to ."); + + chaina_opts.Register(&po); + RegisterCuAllocatorOptions(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() != 5) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + bool ok; + + std::string model_in_dir = po.GetArg(1), + den_fst_dir = po.GetArg(2), + transform_dir = po.GetArg(3), + egs_rspecifier = po.GetArg(4), + model_out_dir = po.GetArg(5); + + NnetChainaModels models(chaina_opts.nnet_config.zero_component_stats, + chaina_opts.bottom_model_test_mode, + chaina_opts.top_model_test_mode, + model_in_dir, den_fst_dir, transform_dir); + + { + NnetChainaTrainer trainer(chaina_opts, &models); + + SequentialNnetChainExampleReader example_reader(egs_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Key(), + example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } + models.WriteRawModels(model_out_dir, binary_write, job_id); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + return (ok ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} From 8cc8068e7f5d07b0147472b86d56e9b8772d73b8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 6 Jan 2019 13:44:07 -0800 Subject: [PATCH 64/87] [src] Add missing files --- src/nnet3a/nnet-chaina-training-test.cc | 44 ++++++ src/nnet3a/nnet-chaina-utils-test.cc | 57 ++++++++ src/nnet3a/nnet-chaina-utils.cc | 186 ++++++++++++++++++++++++ 3 files changed, 287 insertions(+) create mode 100644 src/nnet3a/nnet-chaina-training-test.cc create mode 100644 src/nnet3a/nnet-chaina-utils-test.cc create mode 100644 src/nnet3a/nnet-chaina-utils.cc diff --git a/src/nnet3a/nnet-chaina-training-test.cc b/src/nnet3a/nnet-chaina-training-test.cc new file mode 100644 index 00000000000..c570ba29340 --- /dev/null +++ b/src/nnet3a/nnet-chaina-training-test.cc @@ -0,0 +1,44 @@ +// nnet3/nnet-chaina-training-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3a/nnet-chaina-training.h" + +namespace kaldi { +namespace nnet3 { + + +void UnitTestCompile() { + // just testing the compilation works, i.e. that all member functions are + // defined + NnetChainaTrainingOptions config; + NnetChainaModels models(true, false, false, "a", "b", "c"); + NnetChainaTrainer trainer(config, &models); +} + + +} // namespace nnet3 +} // namespace kaldi + +int main() { + using namespace kaldi; + using namespace kaldi::nnet3; + SetVerboseLevel(2); + // KALDI_LOG << "Tests succeeded."; + return 0; +} diff --git a/src/nnet3a/nnet-chaina-utils-test.cc b/src/nnet3a/nnet-chaina-utils-test.cc new file mode 100644 index 00000000000..6dd9a942ad7 --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils-test.cc @@ -0,0 +1,57 @@ +// nnet3/nnet-chaina-utils-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +void UnitTestParseFromQueryString(){ + std::string value; + KALDI_ASSERT(ParseFromQueryString("abc", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?e=f", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?d=f", "d", &value) == true && + value == "f"); + KALDI_ASSERT(ParseFromQueryString("abc?dd=f", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?dd=f&d=gab", "d", &value) == true && + value == "gab"); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&dd=gab", "d", &value) == true && + value == "f"); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=fda&dd=gab", "ex", &value) == true && + value == "fda"); + + + BaseFloat f; + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=1.0&dd=gab", "ex", &f) == true && + f == 1.0); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=1.0&dd=gab", "e", &f) == false); +} + +} // namespace nnet3 +} // namespace kaldi + +int main() { + using namespace kaldi; + using namespace kaldi::nnet3; + SetVerboseLevel(2); + UnitTestParseFromQueryString(); + KALDI_LOG << "Tests succeeded."; + + return 0; +} diff --git a/src/nnet3a/nnet-chaina-utils.cc b/src/nnet3a/nnet-chaina-utils.cc new file mode 100644 index 00000000000..1a07bdc66b4 --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils.cc @@ -0,0 +1,186 @@ +// nnet3/nnet-chaina-utils.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-utils.h" +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +void FindChainaExampleStructure(const NnetChainExample &eg, + int32 *num_sequences, + int32 *chunks_per_spk, + int32 *first_input_t, + int32 *num_input_frames, + int32 *num_output_frames, + int32 *frame_subsampling_factor, + int32 *eg_left_context, + int32 *eg_right_context) { + if (eg.inputs.size() != 1 || + eg.inputs[0].name != "input") + KALDI_ERR << "Expected eg to have exactly one input, named 'input'"; + + if (eg.outputs.size() != 1 || + eg.outputs[0].name != "output") + KALDI_ERR << "Expected eg to have exactly one output, named 'output'"; + + + const NnetChainSupervision &supervision = eg.outputs[0]; + *num_sequences = supervision.supervision.num_sequences; + *chunks_per_spk = supervision.chunks_per_spk; + + KALDI_ASSERT(supervision.indexes.size() % *num_sequences == 0 && + !supervision.indexes.empty()); + KALDI_ASSERT(supervision.indexes[0] == Index() && + "Expected first index to have t=0,n=0,x=0"); + // We expect t to have the larger stride. 
+ KALDI_ASSERT(supervision.indexes[1].n == 1 && + "Supervision is in an unexpected order"); + Index last_output_index = supervision.indexes.back(); + KALDI_ASSERT(last_output_index.n == *num_sequences - 1); + *num_output_frames = int32(supervision.indexes.size()) / *num_sequences; + int32 last_output_t = last_output_index.t; + KALDI_ASSERT(last_output_t % (*num_output_frames - 1) == 0); + *frame_subsampling_factor = last_output_t / (*num_output_frames - 1); + + + const NnetIo &input_io = eg.inputs[0]; + *first_input_t = - input_io.indexes[0].t; + if (input_io.indexes[0].t != *first_input_t + 1) { + KALDI_ERR << "Input indexes are in the wrong order or not consecutive: " + << input_io.indexes[0].t << " != " << (*first_input_t + 1); + } + Index last_input_index = input_io.indexes.back(); + KALDI_ASSERT(last_input_index.n == *num_sequences - 1); + int32 last_input_t = last_input_index.t; + *num_input_frames = last_input_t + 1 - *first_input_t; + + *eg_left_context = -(*first_input_t); + *eg_right_context = last_input_t - last_output_t; +} + + +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + std::string *value) { + size_t question_mark_location = string.find_last_of("?"); + if (question_mark_location == std::string::npos) + return false; + std::string key_name_plus_equals = key_name + "="; + // the following do/while and the initialization of key_name_location is a + // little convoluted. We want to find "key_name_plus_equals" but if we find + // it and it's not preceded by '?' or '&' then it's part of a longer key and we + // need to ignore it and see if there's a next one. + size_t key_name_location = question_mark_location; + do { + key_name_location = string.find(key_name_plus_equals, + key_name_location + 1); + } while (key_name_location != std::string::npos && + key_name_location != question_mark_location + 1 && + string[key_name_location - 1] != '&'); + + if (key_name_location == std::string::npos) + return false; + size_t value_location = key_name_location + key_name_plus_equals.length(); + size_t next_ampersand = string.find_first_of("&", value_location); + size_t value_len; + if (next_ampersand == std::string::npos) + value_len = std::string::npos; // will mean "rest of string" + else + value_len = next_ampersand - value_location; + *value = string.substr(value_location, value_len); + return true; +} + + +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + BaseFloat *value) { + std::string s; + if (!ParseFromQueryString(string, key_name, &s)) + return false; + bool ans = ConvertStringToReal(s, value); + if (!ans) + KALDI_ERR << "For key " << key_name << ", expected float but found '" + << s << "', in string: " << string; + return true; +} + + +bool ComputeEmbeddingTimes(int32 first_input_t, + int32 num_input_frames, + int32 num_output_frames, + int32 frame_subsampling_factor, + int32 bottom_subsampling_factor, + int32 bottom_left_context, + int32 bottom_right_context, + int32 top_left_context, + int32 top_right_context, + bool keep_embedding_context, + int32 *first_embedding_t, + int32 *num_embedding_frames) { + KALDI_ASSERT(num_input_frames > 0 && num_output_frames > 0 && + first_input_t <= 0 && frame_subsampling_factor > 0); + KALDI_ASSERT(bottom_subsampling_factor > 0 && + frame_subsampling_factor % bottom_subsampling_factor == 0); + KALDI_ASSERT(bottom_left_context >= 0 && bottom_right_context >= 0 && + top_left_context >= 0 && top_right_context >= 0); + + // below '_subsampled' means after dividing the 't' 
values by + // 'bottom_subsampling_factor'. + // Note: implicitly, the first frame required at the output is t=0. + int32 first_required_embedding_t_subsampled = -top_left_context, + last_required_embedding_t_subsampled = + num_output_frames - 1 + top_right_context; + + int32 first_computable_embedding_t = first_input_t + bottom_left_context, + last_computable_embedding_t = + first_input_t + num_input_frames - 1 - bottom_right_context; + + int32 b = bottom_subsampling_factor; + + // By adding b - 1 and doing division that rounds down (towards negative + // infinity, we effectively round up when computing + // first_computable_embedding_t / b, which is appropriate because + // we need the first multiple of b that's actually computable. + int32 first_computable_embedding_t_subsampled = + DivideRoundingDown(first_computable_embedding_t + b - 1, b), + last_computable_embedding_t_subsampled = + DivideRoundingDown(last_computable_embedding_t, b); + if (first_computable_embedding_t_subsampled > first_required_embedding_t_subsampled || + last_computable_embedding_t_subsampled < last_required_embedding_t_subsampled) { + KALDI_WARN << "The training examples have insufficient context vs. the models."; + return false; + } + if (keep_embedding_context) { + *first_embedding_t = first_computable_embedding_t_subsampled * b; + *num_embedding_frames = 1 + last_computable_embedding_t_subsampled - + first_computable_embedding_t_subsampled; + } else { + *first_embedding_t = first_required_embedding_t_subsampled * b; + *num_embedding_frames = 1 + last_required_embedding_t_subsampled - + first_required_embedding_t_subsampled; + } + return true; +} + + + +} // namespace nnet3 +} // namespace kaldi From 97f295c1d2e4837b4cacc01fd85e656fd9dba316 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 6 Jan 2019 17:10:18 -0500 Subject: [PATCH 65/87] minor bugfixes --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 2 +- src/nnet3a/nnet-chaina-training.cc | 15 +++++++++++++++ src/nnet3a/nnet-chaina-utils.cc | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 0e22fc0ae66..467a2a73c38 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -164,7 +164,7 @@ if [ $stage -le 13 ]; then # this 'batchnorm-layer' has an affine component but no nonlinearlity linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 batchnorm-component name=linear_bottleneck_bn - output name=output input=linear_bottleneck + output name=output input=linear_bottleneck_bn EOF steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ --config-file-out $dir/configs/bottom.config diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index c91ac3863d4..523973a72dd 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -230,6 +230,21 @@ NnetChainaTopTrainer::ComputationStructure::ComputationStructure( top_subsampling_factor(top_subsampling_factor) { } +NnetChainaBottomTrainer::ComputationStructure::ComputationStructure( + bool train_model, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 first_output_t): + train_model(train_model), + num_sequences(num_sequences), + frames_per_sequence_in(frames_per_sequence_in), + frames_per_sequence_out(frames_per_sequence_out), + 
first_input_t(first_input_t), + first_output_t(first_output_t) { } + + void NnetChainaTopTrainer::ConsolidateMemory() { ::kaldi::nnet3::ConsolidateMemory(nnet_); ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); diff --git a/src/nnet3a/nnet-chaina-utils.cc b/src/nnet3a/nnet-chaina-utils.cc index 1a07bdc66b4..2f143e8672d 100644 --- a/src/nnet3a/nnet-chaina-utils.cc +++ b/src/nnet3a/nnet-chaina-utils.cc @@ -43,7 +43,7 @@ void FindChainaExampleStructure(const NnetChainExample &eg, const NnetChainSupervision &supervision = eg.outputs[0]; *num_sequences = supervision.supervision.num_sequences; - *chunks_per_spk = supervision.chunks_per_spk; + *chunks_per_spk = supervision.chunks_per_group; KALDI_ASSERT(supervision.indexes.size() % *num_sequences == 0 && !supervision.indexes.empty()); From 8ba1f827e4794220030b2ebcdb0436e1da27e660 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 7 Jan 2019 05:03:01 -0500 Subject: [PATCH 66/87] more small fixes --- egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh | 2 +- src/adapt/differentiable-transform-itf.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 467a2a73c38..379b99b8fb5 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -178,7 +178,7 @@ if [ $stage -le 14 ]; then # is not really a multilingual setup. # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match # with the dimension of this transform (256). - cat <NumClasses())); } From 8becfd5741e07526b55893067835573fece85db3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 7 Jan 2019 12:02:34 -0800 Subject: [PATCH 67/87] [egs,scripts] small fix; add docs. --- egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh | 2 ++ egs/wsj/s5/steps/nnet3/chain/build_tree.sh | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 379b99b8fb5..08a4afa6b5f 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -205,6 +205,8 @@ if [ $stage -le 15 ]; then echo "$0: creating top model" cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 1782fb817f9..6fcbc472412 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -63,6 +63,10 @@ if [ $# != 5 ]; then echo " --frame-subsampling-factor # Factor (e.g. 3) controlling frame subsampling" echo " # at the neural net output, so the frame rate at" echo " # the output is less than at the input." + echo " --alignment-subsampling-factor # Factor controlling subsampling of the input alignment." + echo " # By default it equal to the frame-subsampling-factor," + echo " # but (e.g.) if you use a low-frame-rate system to" + echo " # generate alignments, you might want to set this to 1." echo " --num-clusters # Default: none. E.g. 
200; can be used if you want" echo " # a 2-level tree. Used in 'chaina' setup. The file" echo " # tree.map will be output in this case." From 795ec749c8e4dfe1def21ff3c3893a30981f489a Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 7 Jan 2019 17:21:41 -0500 Subject: [PATCH 68/87] More fixes --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 14 +++++------ egs/wsj/s5/steps/chaina/get_raw_egs.sh | 24 +++++++++---------- egs/wsj/s5/steps/chaina/process_egs.sh | 20 +++++++--------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 08a4afa6b5f..0cd67aedd53 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -221,19 +221,19 @@ if [ $stage -le 15 ]; then prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts # .. and its speaker-independent version - prefinal-layer name=prefinal-chain input=prefinal-si-l $prefinal_opts small-dim=192 big-dim=768 + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts # adding the output layer for xent branch prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts # .. and its speaker-independent version - prefinal-layer name=prefinal-xent input=prefinal-si-l $prefinal_opts small-dim=192 big-dim=768 + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ --config-file-out $dir/configs/default.config - nnet3-init --srand=$srand $dir/configs/default.config 0 - | \ + nnet3-init --srand=$srand $dir/configs/default.config - | \ nnet3-am-init $tree_dir/final.mdl - $dir/0/default.mdl fi @@ -281,11 +281,11 @@ fi model_left_context=$(awk '/^model_left_context/ {print $2;}' $dir/0/info.txt) model_right_context=$(awk '/^model_right_context/ {print $2;}' $dir/0/info.txt) -egs_left_context=$[[model_left_context+egs_extra_left_context]] -egs_right_context=$[[model_right_context+egs_extra_right_context]] +egs_left_context=$[model_left_context+egs_extra_left_context] +egs_right_context=$[model_right_context+egs_extra_right_context] -if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $d/storage ]; then +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [[ ! 
-d $dir/storage ]]; then for d in $dir/raw_egs $dir/processed_egs; do mkdir -p $d utils/create_split_dir.pl \ @@ -303,7 +303,7 @@ if [ $stage -le 18 ]; then --right-context $egs_right_context \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ - --frames-per-egs 150 \ + --frames-per-chunk 150 \ ${train_data_dir} ${dir} ${lat_dir} ${dir}/raw_egs fi diff --git a/egs/wsj/s5/steps/chaina/get_raw_egs.sh b/egs/wsj/s5/steps/chaina/get_raw_egs.sh index 50139a86208..cd2eb516215 100755 --- a/egs/wsj/s5/steps/chaina/get_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/get_raw_egs.sh @@ -126,8 +126,8 @@ dir=$4 tree=$chaindir/${lang}.tree trans_mdl=$chaindir/0/${lang}.mdl # contains the transition model and a nnet, but # we won't be making use of the nnet part. -normalization_fst=$chaindir/0/${lang}.normalization.fst -den_fst=$chaindir/0/${lang}.den.fst +normalization_fst=$chaindir/den_fsts/${lang}.normalization.fst +den_fst=$chaindir/den_fsts/${lang}.den.fst for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ $tree $trans_mdl $normalization_fst $den_fst; do @@ -148,7 +148,7 @@ mkdir -p $dir/log $dir/misc cp $tree $dir/misc/ copy-transition-model $trans_mdl $dir/misc/${lang}.trans_mdl cp $normalization_fst $den_fst $dir/misc/ -cp data/utt2spk $dir/misc/ +cp $data/utt2spk $dir/misc/ if [ -f $data/utt2uniq ]; then cp $data/utt2uniq $dir/misc/ elif [ -f $dir/misc/utt2uniq ]; then @@ -208,21 +208,21 @@ if [ $stage -le 0 ]; then lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ "$lats_rspecifier" ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ - $dir/misc/tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ + $dir/misc/default.tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ - "$normalization_fst" $sdata/JOB/feats.scp ark,s,cs:- \ + "$normalization_fst" scp:$sdata/JOB/feats.scp ark,s,cs:- \ ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp || exit 1; fi if [ $stage -le 1 ]; then - frames_and_chunks=$(for n in $(seq nj); do cat $dir/log/get_egs.$n.log; done | \ - perl -e '$nf=0;$nc=0; while() { if(m/with total length (\d+) frames.+ into (\d+) chunks/) { $nf += $1; $nc += $2; } print "$nf $nc";') - num_frames=$(echo $frames_and_chunks || awk '{print $1}') - num_chunks=$(echo $frames_and_chunks || awk '{print $2}') - frames_per_chunk_avg=$[$num_frames/$num_chunks] - feat_dim=$(feat-to-dim scp:$sdata/JOB/feats.scp -) - num_leaves=$(tree-info $chaindir/tree | awk '/^num-pdfs/ {print $2}') + frames_and_chunks=$(for n in $(seq $nj); do cat $dir/log/get_egs.$n.log; done | \ + perl -e '$nf=0;$nc=0; while() { if(m/with total length (\d+) frames.+ into (\d+) chunks/) { $nf += $1; $nc += $2; }} print "$nf $nc";') + num_frames=$(echo $frames_and_chunks | awk '{print $1}') + num_chunks=$(echo $frames_and_chunks | awk '{print $2}') + frames_per_chunk_avg=$[num_frames/num_chunks] + feat_dim=$(feat-to-dim scp:$sdata/1/feats.scp -) + num_leaves=$(tree-info $tree | awk '/^num-pdfs/ {print $2}') if [ $left_context_initial -lt 0 ]; then left_context_initial=$left_context fi diff --git a/egs/wsj/s5/steps/chaina/process_egs.sh b/egs/wsj/s5/steps/chaina/process_egs.sh index 41232a41972..34cc8869aac 100755 --- a/egs/wsj/s5/steps/chaina/process_egs.sh +++ b/egs/wsj/s5/steps/chaina/process_egs.sh @@ -84,7 +84,7 @@ dir=$2 # die on error or undefined variable. set -e -u -if ! steps/chaina/validate_raw_egs_dir $raw_egs_dir; then +if ! 
steps/chaina/validate_raw_egs.sh $raw_egs_dir; then echo "$0: failed to validate input directory $raw_egs_dir" exit 1 fi @@ -113,40 +113,38 @@ fi if [ $stage -le 1 ]; then for name in heldout_subset train_subset; do - echo "$0: merging and shuffling $train egs" + echo "$0: merging and shuffling $name egs" # Linearize these lists and add keys to make it an scp format. - awk '{for (n=1;n<=NF;n++) { count++; print count "-" $n; }' <$dir/temp/${name}.list >$dir/temp/${name}.scp + awk '{for (n=1;n<=NF;n++) { count++; print count, $n; }}' <$dir/temp/${name}.list >$dir/temp/${name}.scp $cmd $dir/log/merge_${name}_egs.log \ nnet3-chain-merge-egs --minibatch-size=$chunks_per_group --compress=$compress \ scp:$dir/temp/${name}.scp ark:- \| \ - nnet3-chain-shuffle-egs --srand=$srand $ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp + nnet3-chain-shuffle-egs --srand=$srand ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp done # Split up the training list into multiple smaller lists, as it could be long. - utils/split_scp.pl $dir/train.list $(for j in $(seq $nj); do echo $dir/temp/train.$j.list; done) + utils/split_scp.pl $dir/temp/train.list $(for j in $(seq $nj); do echo $dir/temp/train.$j.list; done) # Linearize these lists and add keys to make them in scp format; # nnet3-chain-merge-egs will merge the right groups, it's deterministic # and we specified --minibatch-size=$chunks_per_group. for j in $(seq $nj); do - awk '{for (n=1;n<=NF;n++) { count++; print count "-" $n; }' <$dir/temp/train.$j.list >$dir/temp/train.$j.scp + awk '{for (n=1;n<=NF;n++) { count++; print count, $n; }}' <$dir/temp/train.$j.list >$dir/temp/train.$j.scp done if [ -e $dir/storage ]; then # Make soft links to storage directories, if distributing this way.. See # utils/create_split_dir.pl. echo "$0: creating data links" - utils/create_data_link.pl $(for j in $(seq $nj); do echo $dir/train.$j.ark; done) + utils/create_data_link.pl $(for j in $(seq $nj); do echo $dir/train.$j.ark; done) || true fi - $cmd JOB=1:$nj $dir/log/merge_train_egs.JOB.log \ nnet3-chain-merge-egs --compress=$compress --minibatch-size=$chunks_per_group \ scp:$dir/temp/train.JOB.scp ark:- \| \ - nnet3-chain-shuffle-egs --shuffle-buffer-size=$shuffle_buffer_size \ + nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size \ --srand=\$[JOB+$srand] ark:- ark,scp:$dir/train.JOB.ark,$dir/train.JOB.scp - # the awk command is to ensure unique ids for each group. cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) | awk '{printf("%09d %s\n", NR, $2);}' > $dir/train.scp fi @@ -155,7 +153,7 @@ fi cat $raw_egs_dir/info.txt | awk -v num_repeats=$num_repeats \ -v chunks_per_group=$chunks_per_group ' /^dir_type/ { print "dir_type processed_chaina_egs"; next; } - /^num_input_frames/ { print $2 * num_repeats; next; } # approximate; ignores held-out egs. + /^num_input_frames/ { print "num_input_frames "$2 * num_repeats; next; } # approximate; ignores held-out egs. 
{print;} END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt From 22799ef3f10c1ff23535d55d77a46691f0659c68 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 7 Jan 2019 17:30:26 -0500 Subject: [PATCH 69/87] Minor fix --- egs/wsj/s5/steps/chaina/get_raw_egs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/chaina/get_raw_egs.sh b/egs/wsj/s5/steps/chaina/get_raw_egs.sh index cd2eb516215..752a86787b6 100755 --- a/egs/wsj/s5/steps/chaina/get_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/get_raw_egs.sh @@ -208,7 +208,7 @@ if [ $stage -le 0 ]; then lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ "$lats_rspecifier" ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ - $dir/misc/default.tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ + $dir/misc/${lang}.tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ "$normalization_fst" scp:$sdata/JOB/feats.scp ark,s,cs:- \ ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp || exit 1; From 003982dd1d88f4f6858d9817531150510dcdb44b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 13 Jan 2019 18:44:25 -0500 Subject: [PATCH 70/87] [src,scripts,egs] Further progress --- egs/mini_librispeech/s5/cmd.sh | 2 + .../s5/local/chaina/data_prep_common.sh | 6 +- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 41 ++++--- egs/wsj/s5/steps/chaina/get_raw_egs.sh | 8 +- .../chaina/internal/get_train_schedule.py | 97 +++++++++++++++ egs/wsj/s5/steps/chaina/process_egs.sh | 5 +- egs/wsj/s5/steps/chaina/randomize_egs.sh | 44 ++++--- egs/wsj/s5/steps/chaina/train.sh | 114 ++++++++++++++++++ .../s5/steps/chaina/validate_processed_egs.sh | 2 +- .../steps/chaina/validate_randomized_egs.sh | 10 +- egs/wsj/s5/steps/chaina/validate_raw_egs.sh | 2 +- src/Makefile | 10 +- src/chainbin/nnet3-chain-combine.cc | 4 +- src/featbin/select-feats.cc | 4 +- src/nnet3a/nnet-chaina-training.cc | 59 +++------ src/nnet3a/nnet-chaina-training.h | 20 ++- src/nnet3abin/nnet3-adapt.cc | 10 +- 17 files changed, 330 insertions(+), 108 deletions(-) create mode 100755 egs/wsj/s5/steps/chaina/internal/get_train_schedule.py create mode 100755 egs/wsj/s5/steps/chaina/train.sh diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..3189d83975a 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,8 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +# in future I'd like to start using just one $cmd variable. +export cmd="queue.pl --mem 2G" export train_cmd="queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh index 087756a9ea0..a736fc8c008 100755 --- a/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh +++ b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh @@ -51,7 +51,7 @@ if [ $stage -le 3 ]; then # Create high-resolution MFCC features (with 40 cepstra instead of 13). # this shows how you can split across multiple file-systems. echo "$0: creating high-resolution MFCC features" - mfccdir=data/${train_set}_sp_hires2/data + mfccdir=data/${train_set}_sp_hires/data if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi @@ -62,10 +62,10 @@ if [ $stage -le 3 ]; then # do volume-perturbation on the training data prior to extracting hires # features; this helps make trained nnets more invariant to test data volume. - utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires2 || exit 1; + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires2.conf \ + steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires || exit 1; steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; utils/fix_data_dir.sh data/${datadir}_hires || exit 1; diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 0cd67aedd53..b99e35bd576 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -57,9 +57,6 @@ where "nvcc" is installed. EOF fi -# The iVector-extraction and feature-dumping parts are the same as the standard -# nnet3 setup, and you can skip them by setting "--stage 11" if you have already -# run those things. local/chaina/data_prep_common.sh --stage $stage \ --train-set $train_set \ --gmm $gmm || exit 1; @@ -72,7 +69,7 @@ tree_dir=exp/chaina/tree_sp${tree_affix:+_$tree_affix} lang=data/lang_chain lat_dir=exp/chaina/${gmm}_${train_set}_sp_lats dir=exp/chaina/tdnn${affix}_sp -train_data_dir=data/${train_set}_sp_hires2 +train_data_dir=data/${train_set}_sp_hires lores_train_data_dir=data/${train_set}_sp for f in $gmm_dir/final.mdl $train_data_dir/feats.scp \ @@ -253,9 +250,9 @@ if [ $stage -le 16 ]; then # # note: $langs is "default" steps/chaina/get_model_context.sh \ - --frame-subsampling-factor=$frame_subsampling_factor \ - --bottom-subsampling-factor=$bottom_subsampling_factor \ - --langs="$langs" $dir/0/ > $dir/0/info.txt + --frame-subsampling-factor $frame_subsampling_factor \ + --bottom-subsampling-factor $bottom_subsampling_factor \ + --langs "$langs" $dir/0/ $dir/0/info.txt fi @@ -281,23 +278,26 @@ fi model_left_context=$(awk '/^model_left_context/ {print $2;}' $dir/0/info.txt) model_right_context=$(awk '/^model_right_context/ {print $2;}' $dir/0/info.txt) -egs_left_context=$[model_left_context+egs_extra_left_context] -egs_right_context=$[model_right_context+egs_extra_right_context] - - -if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [[ ! -d $dir/storage ]]; then - for d in $dir/raw_egs $dir/processed_egs; do +# Note: we add frame_subsampling_factor/2 so that we can support the frame +# shifting that's done during training, so if frame-subsampling-factor=3, we +# train on the same egs with the input shifted by -1,0,1 frames. This is done +# via the --frame-shift option to nnet3-chain-copy-egs in the script. +egs_left_context=$[model_left_context+(frame_subsampling_factor/2)+egs_extra_left_context] +egs_right_context=$[model_right_context+(frame_subsampling_factor/2)+egs_extra_right_context] + +for d in $dir/raw_egs $dir/processed_egs; do + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $d/storage ] ; then mkdir -p $d utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$d/storage $d/storage - done -fi + fi +done if [ $stage -le 18 ]; then echo "$0: about to dump raw egs." # Dump raw egs. - steps/chaina/get_raw_egs.sh \ + steps/chaina/get_raw_egs.sh --cmd "$cmd" \ --lang "default" \ --left-context $egs_left_context \ --right-context $egs_right_context \ @@ -309,7 +309,7 @@ fi if [ $stage -le 19 ]; then echo "$0: about to process egs" - steps/chaina/process_egs.sh \ + steps/chaina/process_egs.sh --cmd "$cmd" \ --chunks-per-group ${chunks_per_group} ${dir}/raw_egs ${dir}/processed_egs fi @@ -319,7 +319,14 @@ if [ $stage -le 20 ]; then ${dir}/processed_egs ${dir}/egs fi +if [ $stage -le 21 ]; then + echo "$0: about to train model" + steps/chaina/train.sh \ + --stage $train_stage --cmd "$cmd" \ + --xent-regularize $xent_regularize --leaky-hmm-coefficient 0.1 \ + --dropout-schedule "$dropout_schedule" \ +fi exit 0; diff --git a/egs/wsj/s5/steps/chaina/get_raw_egs.sh b/egs/wsj/s5/steps/chaina/get_raw_egs.sh index 752a86787b6..b637762bb56 100755 --- a/egs/wsj/s5/steps/chaina/get_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/get_raw_egs.sh @@ -72,6 +72,7 @@ lattice_prune_beam= # If supplied, the lattices will be pruned to this be acwt=0.1 # For pruning. Should be, for instance, 1.0 for chain lattices. deriv_weights_scp= +# end configuration section echo "$0 $@" # Print the command line for logging @@ -217,7 +218,10 @@ fi if [ $stage -le 1 ]; then frames_and_chunks=$(for n in $(seq $nj); do cat $dir/log/get_egs.$n.log; done | \ - perl -e '$nf=0;$nc=0; while() { if(m/with total length (\d+) frames.+ into (\d+) chunks/) { $nf += $1; $nc += $2; }} print "$nf $nc";') + perl -e '$nc=0; $nf=0; while() { + if (m/Split .+ into (\d+) chunks/) { $this_nc = $1; } + if (m/Average chunk length was (\d+) frames/) { $nf += $1 * $this_nc; $nc += $this_nc; } + } print "$nf $nc"; ') num_frames=$(echo $frames_and_chunks | awk '{print $1}') num_chunks=$(echo $frames_and_chunks | awk '{print $2}') frames_per_chunk_avg=$[num_frames/num_chunks] @@ -227,7 +231,7 @@ if [ $stage -le 1 ]; then left_context_initial=$left_context fi if [ $right_context_final -lt 0 ]; then - right_context_initial=$right_context + right_context_final=$right_context fi cat >$dir/info.txt <= --num-jobs-initial""") + parser.add_argument("--num-epochs", type=float, default=4.0, + help="""The number of epochs to train for. + Note: the 'real' number of times we see each + utterance is this number times --frame-subsampling-factor + (to cover frame-shifted copies of the data), times + the value of --num-repeats given to process_egs.sh, + times any factor arising from data augmentation.""") + parser.add_argument("--dropout-schedule", type=str, + help="""Use this to specify the dropout schedule (how the dropout probability varies + with time, 0 == no dropout). You specify a piecewise + linear function on the domain [0,1], where 0 is the + start and 1 is the end of training; the + function-argument (x) rises linearly with the amount of + data you have seen, not iteration number (this improves + invariance to num-jobs-{initial-final}). E.g. '0,0.2,0' + means 0 at the start; 0.2 after seeing half the data; + and 0 at the end. You may specify the x-value of + selected points, e.g. '0,0.2@0.25,0' means that the 0.2 + dropout-proportion is reached a quarter of the way + through the data. 
The start/end x-values are at + x=0/x=1, and other unspecified x-values are interpolated + between known x-values. You may specify different rules + for different component-name patterns using + 'pattern1=func1 pattern2=func2', e.g. 'relu*=0,0.1,0 + lstm*=0,0.2,0'. More general should precede less + general patterns, as they are applied sequentially.""") + + parser.add_argument("--schedule-out", type=str, required=True, + "Output file containing the training schedule. The output + is lines, one per training iteration. Each line contains + tab-separated fields of the form: + + where is an iteration index starting from 0, + is the number of jobs for this iteration (between + num-jobs-initial and num-jobs-final), + is a space-separated string containing the + indexes of the .scp files in the egs dir to use for this + iteration (e.g. '1 2 3'), is either the empty + string or something to be passed to the --edits command of + nnet3-am-copy or nnet3-copy; is the + actual learning rate on this iteration (the effective learning + rate times the num-jobs), and is a space-separated + string containing the frame shifts for each job.") + + + +def main(): + pass +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/chaina/process_egs.sh b/egs/wsj/s5/steps/chaina/process_egs.sh index 34cc8869aac..e8d8cfeab4e 100755 --- a/egs/wsj/s5/steps/chaina/process_egs.sh +++ b/egs/wsj/s5/steps/chaina/process_egs.sh @@ -152,8 +152,9 @@ fi cat $raw_egs_dir/info.txt | awk -v num_repeats=$num_repeats \ -v chunks_per_group=$chunks_per_group ' - /^dir_type/ { print "dir_type processed_chaina_egs"; next; } - /^num_input_frames/ { print "num_input_frames "$2 * num_repeats; next; } # approximate; ignores held-out egs. + /^dir_type / { print "dir_type processed_chaina_egs"; next; } + /^num_input_frames / { print "num_input_frames "$2 * num_repeats; next; } # approximate; ignores held-out egs. + /^num_chunks / { print "num_chunks " $2 * num_repeats; next; } {print;} END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt diff --git a/egs/wsj/s5/steps/chaina/randomize_egs.sh b/egs/wsj/s5/steps/chaina/randomize_egs.sh index 6c49f5112ab..943d383c571 100755 --- a/egs/wsj/s5/steps/chaina/randomize_egs.sh +++ b/egs/wsj/s5/steps/chaina/randomize_egs.sh @@ -86,18 +86,20 @@ fi # Work out how many groups per job and how many frames per job we'll have -frames_per_group_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; } /^chunks_per_group/ { print int(fpc * $2); }') +info_in=$processed_egs_dir/info.txt + +frames_per_group_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; } /^chunks_per_group/ { print int(fpc * $2); }' $info_in) if ! [ $frames_per_group_avg -gt 0 ]; then echo "$0: error getting frames per group."; fi num_groups=$(wc -l <$processed_egs_dir/train.scp) -num_scp_files=$[[ (frames_per_group_avg + frames_per_job / 2) / frames_per_job ]] +num_scp_files=$[(frames_per_group_avg*num_groups + frames_per_job/2) / frames_per_job] [ $num_scp_files -eq 0 ] && num_scp_files=1 -frames_per_scp_file=$[[(frames_per_group_avg * num_groups) / num_scp_files]] -groups_per_scp_file=$[[ num_groups / num_scp_files]] +frames_per_scp_file=$[(frames_per_group_avg * num_groups) / num_scp_files] +groups_per_scp_file=$[ num_groups / num_scp_files] mkdir -p $dir/temp @@ -115,11 +117,12 @@ cp $processed_egs_dir/misc/* $dir/misc # do this in a specially-written python script, but instead we do it with a # combination of existing Kaldi and UNIX utilities. 
-awk '{block=sprintf("%05d", NR / groups_per_block); group_id=$1; print group_id, block;}' \ +awk -v gpb=$groups_per_block \ + '{block=sprintf("%05d", NR / gpb); group_id=$1; print group_id, block;}' \ <$processed_egs_dir/train.scp >$dir/temp/key2block # get list of blocks -awk '{print $2}' | uniq <$dir/temp/key2block > $dir/temp/blocks +awk '{print $2}' <$dir/temp/key2block | uniq > $dir/temp/blocks # get randomized-order list of blocks utils/shuffle_list.pl --srand "$srand" <$dir/temp/blocks > $dir/temp/blocks_rand # Map block-ids to randomized-order block-ids @@ -152,21 +155,34 @@ utils/split_scp.pl --utt2spk=$dir/temp/key2block_rand \ cp $processed_egs_dir/heldout_subset.scp $processed_egs_dir/train_subset.scp $dir/ +# note: there is only one language in $processed_egs_dir (any +# merging would be done at the randomization stage but that is not supported yet). + +lang=$(awk '/^lang / { print $2; }' <$processed_egs_dir/info.txt) + +# We'll store info files per language, containing the part of the information +# that is language-specific, plus a single global info.txt containing stuff that +# is not language specific. +# This will get more complicated once we actually support multiple languages, +# and when we allow multiple input processed egs dirs for the same language. -cat $processed_egs_dir/info.txt | awk ' - /^dir_type/ { print "dir_type randomized_chaina_egs"; next; } - /^lang / { print "langs", $2; next } - /^num_input_frames/ { print $2 * num_repeats; next; } # approximate; ignores held-out egs. - {print;} - END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt +grep -v -E '^dir_type|^lang|^feat_dim' <$processed_egs_dir/info.txt | \ + cat <(echo "dir_type randomized_chaina_egs") - > $dir/info_$lang.txt -cat <>$dir/info.txt + +cat <$dir/info.txt +dir_type randomized_chaina_egs num_scp_files $num_scp_files +langs $lang frames_per_scp_file $frames_per_scp_file groups_per_scp_file $groups_per_scp_file EOF +# frames_per_job, after rounding, becomes frames_per_scp_file. + +# note: frames_per_chunk_avg will be present in the info.txt file as well as +# the per-language files. +grep -E '^feat_dim|^frames_per_chunk_avg' <$processed_egs_dir/info.txt >>$dir/info.txt -# Note: frame_per_job, after rounding, becomes frames_per_scp_file. if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh new file mode 100755 index 00000000000..3e68e63aa9d --- /dev/null +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + + +# Begin configuration section +stage=0 +leaky_hmm_coefficient=0.1 +xent_regularize=0.1 +apply_deriv_weights=false # you might want to set this to true in unsupervised training + # scenarios. +memory_compression_level=2 # Enables us to use larger minibatch size than we + # otherwise could, but may not be optimal for speed + # (--> set to 0 if you have plenty of memory. 
+dropout_schedule= +srand=0 +max_param_change=2.0 +use_gpu=yes # can be "yes", "no", "optional", "wait" + +common_opts= # Options passed through to nnet3-chaina-train and nnet3-chaina-combine + +unadapted_top_weight=0.5 +unadapted_bottom_weight=0.5 + +num_epochs=4.0 # Note: each epoch may actually contain multiple repetitions of + # the data, for various reasons: + # using the --num-repeats option in process_egs.sh + # data augmentation + # different data shifts (this includes 3 different shifts + # of the data if frame_subsampling_factor=3 (see $dir/0/info.txt) + +num_jobs_initial=1 +num_jobs_final=1 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +num_groups_per_minibatch=32 # note: if chunks_per_group=4, this would mean 128 + # chunks per minibatch. + +max_iters_combine=80 +max_models_combine=20 + +# End configuration section + + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs exp/chaina/tdnn1a_sp" + echo "" + echo " TODO: more documentation" +fi + +egs_dir=$1 +dir=$2 + +set -e -u # die on failed command or undefined variable + +steps/chaina/validate_randomized_egs.sh $egs_dir + +for f in $dir/0/info.txt $dir/0/bottom.raw; do + echo "$0: expected file $f to exist" + exit 1 +done + + +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}') +num_scp_files=$(awk '/^num_scp_files/ {print $2}') + +steps/chaina/internal/get_train_schedule.py \ + --frame-subsampling-factor=$frame_subsampling_factor \ + --num-jobs-initial=$num_jobs_initial \ + --num-jobs-final=$num_jobs_final \ + --num-epochs=$num_epochs \ + --num-scp-files=$num_scp_files \ + --frame-subsampling-factor=$frame_subsampling_factor \ + --initial-effective-lrate=$initial_effective_lrate \ + --final-effective-lrate=$final_effective_lrate \ + --schedule-out=$dir/schedule.txt + + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.scp $dir/info.txt \ + $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \ + $dir/train.1.scp $dir/train.1.ark; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "processed_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be processed_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." 
+ exit 1 + fi +done + +echo "$0: sucessfully validated processed egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh index c25f4a89a01..d928642dff9 100755 --- a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh +++ b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh @@ -46,4 +46,4 @@ for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do fi done -echo "$0: sucessefully validated processed egs in $dir" +echo "$0: sucessfully validated processed egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh index 32a97069f7d..1eebc144347 100755 --- a/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh +++ b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh @@ -15,7 +15,7 @@ if [ $# != 1 ]; then echo "Usage: $0 " echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs" echo "" - echo "Validates that the final (ranodmized) egs dir has the expected format" + echo "Validates that the final (randomized) egs dir has the expected format" fi dir=$1 @@ -23,7 +23,7 @@ dir=$1 # Note: the .ark files are not actually consumed directly downstream (only via # the top-level .scp files), but we check them anyway for now. for f in $dir/train.1.scp $dir/info.txt \ - $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp}; do + $dir/heldout_subset.scp $dir/train_subset.scp; do if ! [ -f $f -a -s $f ]; then echo "$0: expected file $f to exist and be nonempty." exit 1 @@ -31,7 +31,7 @@ for f in $dir/train.1.scp $dir/info.txt \ done -if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "ranodmized_chaina_egs" ]; then +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "randomized_chaina_egs" ]; then grep dir_type $dir/info.txt echo "$0: dir_type should be randomized_chaina_egs in $dir/info.txt" exit 1 @@ -46,7 +46,7 @@ if [ -z "$langs" ]; then fi for lang in $langs; do - for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst} $dir/info_${lang}.txt; do if ! [ -f $f -a -s $f ]; then echo "$0: expected file $f to exist and be nonempty." 
exit 1 @@ -62,4 +62,4 @@ for i in $(seq $num_scp_files); do done -echo "$0: sucessefully validated randomized egs in $dir" +echo "$0: sucessfully validated randomized egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh index c06920d58c5..5e15bc0c897 100755 --- a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh @@ -43,4 +43,4 @@ for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do fi done -echo "$0: sucessefully validated raw egs in $dir" +echo "$0: sucessfully validated raw egs in $dir" diff --git a/src/Makefile b/src/Makefile index 32301e254dd..737a26338ca 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,14 +8,14 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat tree gmm transform \ fstext hmm lm decoder lat kws cudamatrix adapt nnet \ bin fstbin gmmbin fgmmbin featbin \ - nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin + nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 nnet3a rnnlm chain nnet3bin nnet2bin kwsbin \ + ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin nnet3abin MEMTESTDIRS = base matrix util feat tree gmm transform \ - fstext hmm lm decoder lat nnet kws chain \ + fstext hmm lm decoder lat nnet kws chain nnet3a \ bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + ivector ivectorbin online2 online2bin lmbin nnet3abin CUDAMEMTESTDIR = cudamatrix @@ -150,7 +150,7 @@ $(EXT_SUBDIRS) : mklibdir ext_depend ### Dependency list ### # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \ +bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin nnet3abin: \ base matrix util feat tree gmm transform sgmm2 fstext hmm \ lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index a3222d2285f..2accefc57fa 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) { "Usage: nnet3-chain-combine [options] ... \n" "\n" "e.g.:\n" - " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; + " nnet3-chain-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; bool binary_write = true; int32 max_objective_evaluations = 30; @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) { "maximum number of objective evaluations in order to figure " "out the best number of models to combine. It helps to speedup " "if the number of models provided to this binary is quite " - "large (e.g. several hundred)."); + "large (e.g. several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, diff --git a/src/featbin/select-feats.cc b/src/featbin/select-feats.cc index c10f0c64ed5..284902f782e 100644 --- a/src/featbin/select-feats.cc +++ b/src/featbin/select-feats.cc @@ -37,7 +37,9 @@ int main(int argc, char *argv[]) { "command cut -f ...\n" "Usage: select-feats \n" " e.g. 
select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n" - "See also copy-feats, extract-feature-segments, subset-feats, subsample-feats\n"; + "See also copy-feats, extract-feature-segments, subset-feats, subsample-feats\n" + "Note: this command should no longer be needed in most cases, as it can be done\n" + "more efficiently at the script level; see the script utils/data/limit_feature_dim.sh"; ParseOptions po(usage); diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 523973a72dd..6afbb082b6e 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -196,7 +196,6 @@ NnetChainaTopTrainer::NnetChainaTopTrainer( nnet_(nnet), delta_nnet_(nnet->Copy()), num_minibatches_processed_(0), - max_change_stats_si_(*nnet), max_change_stats_(*nnet) { if (opts_.nnet_config.zero_component_stats) @@ -323,7 +322,7 @@ bool NnetChainaTopTrainer::TrainUnadapted( const CuMatrixBase &input, const NnetComputation &computation, const chain::Supervision &supervision, - BaseFloat model_training_scale, + bool need_model_deriv, const CuVectorBase &deriv_weights, Posterior *posterior, CuMatrix *input_deriv) { @@ -393,14 +392,14 @@ bool NnetChainaTopTrainer::TrainUnadapted( num_minibatches_processed_, tot_weight, tot_objf, tot_l2_term); - if (input_deriv == NULL && model_training_scale == 0.0) + if (input_deriv == NULL && !need_model_deriv) return true; // Freeze the natural gradient. We don't want to update the NG scatter // matrices on this data because we'll next be running the same nnet on the // speaker-adapted version of the same data, and it would violate the // independence assumptions needed for NG to work if we updated them. - if (model_training_scale != 0.0) + if (need_model_deriv) FreezeNaturalGradient(true, delta_nnet_); computer.AcceptInput("output-si", &output_deriv); @@ -414,37 +413,14 @@ bool NnetChainaTopTrainer::TrainUnadapted( if (input_deriv != NULL) computer.GetOutputDestructive("input", input_deriv); - static bool warned_momentum = false; - if (model_training_scale != 1.0 && - nnet_config.momentum != 0.0 && !warned_momentum) { - KALDI_WARN << "Momentum does not interact correctly with top_weight or " - "bottom_weight values. Will not warn again."; - warned_momentum = true; - } - - if (model_training_scale != 0.0) { - // If we're actually training the top model... - - // Update the parameters of nnet. - // Note: normally momentum is 0.0. - bool success = UpdateNnetWithMaxChange( - *delta_nnet_, - nnet_config.max_param_change, - 1.0, - model_training_scale * (1.0 - nnet_config.momentum), - nnet_, &max_change_stats_si_); - - // Un-freeze the natural gradient. + if (need_model_deriv) // Un-freeze the natural gradient. FreezeNaturalGradient(false, delta_nnet_); - if (success) - ScaleNnet(nnet_config.momentum, delta_nnet_); - else - ScaleNnet(0.0, delta_nnet_); - return success; - } else { - return true; - } + // We'll wait until after the adapted pass to call UpdateNnetWithMaxChange(). + // Training the model on these features in between the two passes would leave + // a strong memory of this minibatch in the model's parameters which could + // cause weird effects. 
+ return true; } bool NnetChainaTopTrainer::TrainAdapted( @@ -583,11 +559,15 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, if (opts_.apply_deriv_weights) deriv_weights = deriv_weights_in; + + bool need_unadapted_model_deriv = + (model_training_scale * opts_.unadapted_top_weight) != 0.0; + std::shared_ptr computation_unadapted = GetComputation(structure); bool success = TrainUnadapted( input, *computation_unadapted, supervision, - model_training_scale * opts_.unadapted_top_weight, + need_unadapted_model_deriv, deriv_weights, &post, input_deriv); if (!success) { @@ -595,7 +575,11 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, return false; } - if (input_deriv) { + // Scale down the model derivatives from the unadapted pass. + if (need_unadapted_model_deriv && opts_.unadapted_top_weight != 1.0) + ScaleNnet(opts_.unadapted_top_weight, delta_nnet_); + + if (input_deriv && opts_.unadapted_bottom_weight != 1.0) { // Apply the scale from --unadapted-bottom-weight. We'll supply the other // factor that comes from from the language-specific bottom_weight ("bw") // ito UpdateNnetWithMaxChange() later on when we train the bottom nnet. @@ -725,10 +709,7 @@ bool NnetChainaTopTrainer::PrintTotalStats() const { ans = true; if (output_xent_objf_.PrintTotalStats(lang_name_ + ":output-xent")) ans = true; - KALDI_LOG << "Speaker-independent max-change stats for language " - << lang_name_ << ":"; - max_change_stats_si_.Print(*nnet_); - KALDI_LOG << "Speaker-dependent max-change stats for language " + KALDI_LOG << "Max-change stats for language " << lang_name_ << ":"; max_change_stats_.Print(*nnet_); return ans; diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index 60579eeafff..d6123462432 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -47,7 +47,7 @@ struct NnetChainaTrainingOptions { NnetChainaTrainingOptions(): apply_deriv_weights(true), - unadapted_top_weight(1.0), + unadapted_top_weight(0.5), unadapted_bottom_weight(0.5), bottom_subsampling_factor(1), keep_embedding_context(true), @@ -465,12 +465,9 @@ class NnetChainaTopTrainer { @param [in] supervision The chain supervision object. The nnet output dimensions are worked out from this, as well as using this object to compute the objective function. - @param [in] model_training_scale A scale we'll apply to the parameter - changes and max-change values when taking any step. - This will be the product of the top_weight ("tw") from - the key in the egs, with the value of the - --unadapted-top-weight option. If this is zero, we - won't be training the top model on this eg at all. + @param [in] need_model_deriv True if we are training on this minibatch, + on the unadapted data-- i.e. if we need to compute + the model derivative. @param [in] deriv_weights Weights to be applied to the derivatives for the corresponding frames of the output (order is: first frame for all sequences; second frame for @@ -494,7 +491,7 @@ class NnetChainaTopTrainer { bool TrainUnadapted(const CuMatrixBase &input, const NnetComputation &computation, const chain::Supervision &supervision, - BaseFloat model_training_scale, + bool need_model_deriv, const CuVectorBase &deriv_weights, Posterior *posterior, CuMatrix *input_deriv); @@ -626,9 +623,10 @@ class NnetChainaTopTrainer { // speaker-dependent passes. 
int32 num_minibatches_processed_; - // stats for max-change (for speaker-independent phases of training) - MaxChangeStats max_change_stats_si_; - // stats for max-change (for speaker-adapted phases of training) + // stats for max-change. This combines both speaker-independent and + // speaker-adapted phases of training, since we compute the gradient summed + // over both passes (with the unadapted derivatives weighted by + // opts_.unadapted_top_weight) before updating the model. MaxChangeStats max_change_stats_; }; diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc index 547a18fb62d..b5613dfa7cd 100644 --- a/src/nnet3abin/nnet3-adapt.cc +++ b/src/nnet3abin/nnet3-adapt.cc @@ -65,7 +65,7 @@ int main(int argc, char *argv[]) { po.Read(argc, argv); - if (po.GetArg(1) == "init" && po.NumArgs() == 3) { + if (po.GetOptArg(1) == "init" && po.NumArgs() == 3) { // This block does the "init" command where the tree.map was not provided. if (num_classes <= 0) KALDI_ERR << "The --num-classes option is required with the " @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { WriteKaldiObject(transform, transform_wxfilename, binary_write); return 0; - } else if (po.GetArg(1) == "init" && po.NumArgs() == 4) { + } else if (po.GetOptArg(1) == "init" && po.NumArgs() == 4) { // This block does the "init" command where the tree.map was provided. std::string config_rxfilename = po.GetArg(2), tree_map_rxfilename = po.GetArg(3), @@ -109,13 +109,13 @@ int main(int argc, char *argv[]) { ki.Stream(), num_classes); WriteKaldiObject(transform, transform_wxfilename, binary_write); return 0; - } else if (po.GetArg(1) == "info" && po.NumArgs() == 2) { + } else if (po.GetOptArg(1) == "info" && po.NumArgs() == 2) { std::string transform_rxfilename = po.GetArg(2); DifferentiableTransformMapped transform; ReadKaldiObject(transform_rxfilename, &transform); std::cout << transform.Info(); return 0; - } else if (po.GetArg(1) == "copy" && po.NumArgs() == 3) { + } else if (po.GetOptArg(1) == "copy" && po.NumArgs() == 3) { std::string transform_rxfilename = po.GetArg(2), transform_wxfilename = po.GetArg(3); DifferentiableTransformMapped transform; @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) { } WriteKaldiObject(transform, transform_wxfilename, binary_write); return 0; - } else if (po.GetArg(1) == "adapt" && po.NumArgs() == 5) { + } else if (po.GetOptArg(1) == "adapt" && po.NumArgs() == 5) { KALDI_ERR << "The 'adapt' command has not been implemented yet."; return 0; } else { From d827df78a63860a368d6dac8e3c7eb2ba72ce101 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 14 Jan 2019 02:13:03 -0500 Subject: [PATCH 71/87] further fixes --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 31 +++++++------- egs/wsj/s5/steps/chaina/train.sh | 40 ++++++++++++++++--- src/nnet3/nnet-training.h | 2 +- src/nnet3a/nnet-chaina-training.cc | 6 +-- src/nnet3a/nnet-chaina-utils.cc | 6 +-- src/nnet3abin/nnet3-chaina-train.cc | 4 +- 6 files changed, 59 insertions(+), 30 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index b99e35bd576..9641911d76f 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -133,8 +133,8 @@ fi # models. It's a scratch space used by this script but not by # scripts called from here. mkdir -p $dir/configs/ -# $dir/0 will contain the models for iteration zero. 
-mkdir -p $dir/0/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ l2=0.03 tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" @@ -149,7 +149,7 @@ if [ $stage -le 13 ]; then echo "$0: creating top neural net using the xconfig parser"; cat < $dir/configs/bottom.xconfig - input dim=256 name=input + input dim=40 name=input batchnorm-component name=input-batchnorm @@ -165,7 +165,7 @@ if [ $stage -le 13 ]; then EOF steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ --config-file-out $dir/configs/bottom.config - nnet3-init --srand=$srand $dir/configs/bottom.config $dir/0/bottom.raw + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw fi if [ $stage -le 14 ]; then @@ -175,7 +175,7 @@ if [ $stage -le 14 ]; then # is not really a multilingual setup. # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match # with the dimension of this transform (256). - cat < $dir/0/info.txt < $dir/init/info.txt < $dir/0/info.txt < $dir/init/info.txt < $dir/0/info.txt + --langs="$langs" $dir/init/ > $dir/init/info.txt fi if [ $stage -le 14 ]; then diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index 3e68e63aa9d..27b13a43c2a 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -27,7 +27,7 @@ num_epochs=4.0 # Note: each epoch may actually contain multiple repetitions o # using the --num-repeats option in process_egs.sh # data augmentation # different data shifts (this includes 3 different shifts - # of the data if frame_subsampling_factor=3 (see $dir/0/info.txt) + # of the data if frame_subsampling_factor=3 (see $dir/init/info.txt) num_jobs_initial=1 num_jobs_final=1 @@ -54,6 +54,7 @@ if [ $# != 2 ]; then echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs exp/chaina/tdnn1a_sp" echo "" echo " TODO: more documentation" + exit 1 fi egs_dir=$1 @@ -63,14 +64,16 @@ set -e -u # die on failed command or undefined variable steps/chaina/validate_randomized_egs.sh $egs_dir -for f in $dir/0/info.txt $dir/0/bottom.raw; do - echo "$0: expected file $f to exist" - exit 1 +for f in $dir/init/info.txt $dir/init/bottom.raw; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi done -frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}') -num_scp_files=$(awk '/^num_scp_files/ {print $2}') +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$dir/init/info.txt) +num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$dir/init/info.txt) steps/chaina/internal/get_train_schedule.py \ --frame-subsampling-factor=$frame_subsampling_factor \ @@ -84,6 +87,31 @@ steps/chaina/internal/get_train_schedule.py \ --schedule-out=$dir/schedule.txt + +num_iters=$(wc -l <$dir/schedule.txt) +langs=$(awk '/^langs/ { $1=""; print; }' <$dir/0/info.txt) + +mkdir -p $dir/log + + +# Copy models with initial learning rate and dropout options from $dir/init to $dir/0 +mkdir -p $dir/0 +lrate=$(awk ' {if(NR-1==0) { print;exit(0);}}' <$dir/schedule.txt | cut -f 5) +dropout_str=$(awk ' {if(NR-1==0) { print;exit(0);}}' <$dir/schedule.txt | cut -f 4) +run.pl $dir/log/init_bottom_model.log \ + nnet3-copy --learning-rate=$lrate --edits="$dropout_str" $dir/init/bottom.raw $dir/0/bottom.raw +for lang in $langs; do + run.pl $dir/log/init_model_$lang.log \ + nnet3-am-copy --learning-rate=$lrate --edits="$dropout_str" $dir/init/$lang.mdl $dir/0/$lang.mdl +done + + +iter=0 + +echo "exiting early" +exit 0 + + # Note: the .ark files are not actually consumed directly downstream (only via # the top-level .scp files), but we check them anyway for now. for f in $dir/train.scp $dir/info.txt \ diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index 64ec7abc58e..f09649d1506 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -82,7 +82,7 @@ struct NnetTrainerOptions { opts->Register("l2-regularize-factor", &l2_regularize_factor, "Factor that " "affects the strength of l2 regularization on model " "parameters. The primary way to specify this type of " - "l2 regularization is via the 'l2-regularize'" + "l2 regularization is via the 'l2-regularize' " "configuration value at the config-file level. " " --l2-regularize-factor will be multiplied by the component-level " "l2-regularize values and can be used to correct for effects " diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 6afbb082b6e..bb0f023e014 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -310,7 +310,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( } } request.outputs[1].has_deriv = !opts_.top_model_test_mode; - request.outputs[1].name = (s.adapted ? "output-xent" : "output-xent-si"); + request.outputs[1].name = (s.adapted ? 
"output-xent" : "output-si-xent"); request.outputs[1].indexes = request.outputs[0].indexes; std::shared_ptr computation = compiler_.Compile( request); @@ -541,7 +541,7 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, const chain::Supervision &supervision, BaseFloat model_training_scale, CuMatrix *input_deriv) { - KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences != 0); + KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences == 0); int32 frames_per_sequence_in = input.NumRows() / num_sequences, frames_per_sequence_out = supervision.frames_per_sequence; @@ -982,7 +982,7 @@ void NnetChainaTrainer::Train(const std::string &key, &num_input_frames, &num_output_frames, &frame_subsampling_factor, &eg_left_context, &eg_right_context); - KALDI_ASSERT(chunks_per_group % num_sequences == 0); + KALDI_ASSERT(num_sequences % chunks_per_group == 0); int32 num_groups = num_sequences / chunks_per_group; AmNnetSimple *top_am_nnet = models_->GetNnetForLang(lang_name); diff --git a/src/nnet3a/nnet-chaina-utils.cc b/src/nnet3a/nnet-chaina-utils.cc index 2f143e8672d..a83097395de 100644 --- a/src/nnet3a/nnet-chaina-utils.cc +++ b/src/nnet3a/nnet-chaina-utils.cc @@ -61,10 +61,10 @@ void FindChainaExampleStructure(const NnetChainExample &eg, const NnetIo &input_io = eg.inputs[0]; - *first_input_t = - input_io.indexes[0].t; - if (input_io.indexes[0].t != *first_input_t + 1) { + *first_input_t = input_io.indexes[0].t; + if (input_io.indexes[1].t != *first_input_t + 1) { KALDI_ERR << "Input indexes are in the wrong order or not consecutive: " - << input_io.indexes[0].t << " != " << (*first_input_t + 1); + << input_io.indexes[1].t << " != " << (*first_input_t) << " + 1"; } Index last_input_index = input_io.indexes.back(); KALDI_ASSERT(last_input_index.n == *num_sequences - 1); diff --git a/src/nnet3abin/nnet3-chaina-train.cc b/src/nnet3abin/nnet3-chaina-train.cc index d413c7ef0d6..386e9dc9baa 100644 --- a/src/nnet3abin/nnet3-chaina-train.cc +++ b/src/nnet3abin/nnet3-chaina-train.cc @@ -43,8 +43,8 @@ int main(int argc, char *argv[]) { " should contain bottom.raw, and .mdl for each language \n" " should contain .den.fst for each language \n" " should contain .ada for each language \n" - " is a place to where bottom.raw and .raw for each language\n" - " that was seen in the egs, will be written.\n"; + " is a place to where bottom..raw and ..raw for each language\n" + " that was seen in the egs, will be written (for , see the --job-id option).\n"; int32 srand_seed = 0; From 7617a8a28ae11a90eb7a1e91ae82e015e3e33dd0 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 14 Jan 2019 16:27:27 -0500 Subject: [PATCH 72/87] Implement get_train_schedule.py --- .../chaina/internal/get_train_schedule.py | 68 +++++++++++++++++-- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py index 9696cbfe32e..067088db8e4 100755 --- a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py +++ b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright 2019 Johns Hopkins University (author: Daniel Povey) +# Copyright Hossein Hadian # Apache 2.0. 
@@ -16,8 +17,6 @@ import libs.nnet3.train.common as common_train_lib import libs.common as common_lib - - def get_args(): parser = argparse.ArgumentParser( description="Output training schedule information to be consumed by ../train.py", @@ -52,6 +51,9 @@ def get_args(): (to cover frame-shifted copies of the data), times the value of --num-repeats given to process_egs.sh, times any factor arising from data augmentation.""") + parser.add_argument("--num-repeats", type=float, default=1.0, + help="""The number of repeats...TODO + .""") parser.add_argument("--dropout-schedule", type=str, help="""Use this to specify the dropout schedule (how the dropout probability varies with time, 0 == no dropout). You specify a piecewise @@ -72,8 +74,11 @@ def get_args(): lstm*=0,0.2,0'. More general should precede less general patterns, as they are applied sequentially.""") + parser.add_argument("--num-archives", type=int, default=0, required=True, + help="""The number of repeats...TODO + .""") parser.add_argument("--schedule-out", type=str, required=True, - "Output file containing the training schedule. The output + help="""Output file containing the training schedule. The output is lines, one per training iteration. Each line contains tab-separated fields of the form: @@ -87,11 +92,64 @@ def get_args(): nnet3-am-copy or nnet3-copy; is the actual learning rate on this iteration (the effective learning rate times the num-jobs), and is a space-separated - string containing the frame shifts for each job.") + string containing the frame shifts for each job.""") + + print(sys.argv, file=sys.stderr) + args = parser.parse_args() + + return args + +def get_schedules(args): + num_archives_expanded = args.num_archives * args.frame_subsampling_factor + num_archives_to_process = int(args.num_epochs * num_archives_expanded + * args.num_repeats) + num_archives_processed = 0 + num_iters = ((num_archives_to_process * 2) + // (args.num_jobs_initial + args.num_jobs_final)) + + with open(args.schedule_out, 'w', encoding='latin-1') as ostream: + for iter in range(num_iters): + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) + lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + dropout_edit_string = common_train_lib.get_dropout_edit_string( + args.dropout_schedule, + float(num_archives_processed) / num_archives_to_process, + iter) + + frame_shifts = [] + egs = [] + for job in range(1, current_num_jobs + 1): + # k is a zero-based index that we will derive the other indexes from. + k = num_archives_processed + job - 1 + # work out the 1-based archive index. 
+ archive_index = (k % args.num_archives) + 1 + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + frame_shift = ((archive_index + k // args.num_archives) + % args.frame_subsampling_factor) + frame_shifts.append(str(frame_shift)) + egs.append(str(archive_index)) + + print('{iteration}\t{nj}\t{egs}\t{dropout}\t{lr}\t' + '{shifts}'.format(iteration=iter, nj=current_num_jobs, + egs=' '.join(egs), + dropout=dropout_edit_string, lr=lrate, + shifts=' '.join(frame_shifts)), file=ostream) + + num_archives_processed = num_archives_processed + current_num_jobs def main(): - pass + args = get_args() + get_schedules(args) + if __name__ == "__main__": main() From 95f967b432e793276a4cb97cb7bc4b2d5a43bab7 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 14 Jan 2019 17:04:54 -0500 Subject: [PATCH 73/87] Minor changes --- .../chaina/internal/get_train_schedule.py | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py index 067088db8e4..cfb809b8ecc 100755 --- a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py +++ b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py @@ -51,9 +51,6 @@ def get_args(): (to cover frame-shifted copies of the data), times the value of --num-repeats given to process_egs.sh, times any factor arising from data augmentation.""") - parser.add_argument("--num-repeats", type=float, default=1.0, - help="""The number of repeats...TODO - .""") parser.add_argument("--dropout-schedule", type=str, help="""Use this to specify the dropout schedule (how the dropout probability varies with time, 0 == no dropout). You specify a piecewise @@ -74,9 +71,8 @@ def get_args(): lstm*=0,0.2,0'. More general should precede less general patterns, as they are applied sequentially.""") - parser.add_argument("--num-archives", type=int, default=0, required=True, - help="""The number of repeats...TODO - .""") + parser.add_argument("--num-scp-files", type=int, default=0, required=True, + help="""The number of .scp files in the egs dir.""") parser.add_argument("--schedule-out", type=str, required=True, help="""Output file containing the training schedule. The output is lines, one per training iteration. 
Each line contains @@ -100,11 +96,10 @@ def get_args(): return args def get_schedules(args): - num_archives_expanded = args.num_archives * args.frame_subsampling_factor - num_archives_to_process = int(args.num_epochs * num_archives_expanded - * args.num_repeats) - num_archives_processed = 0 - num_iters = ((num_archives_to_process * 2) + num_scp_files_expanded = args.num_scp_files * args.frame_subsampling_factor + num_scp_files_to_process = int(args.num_epochs * num_scp_files_expanded) + num_scp_files_processed = 0 + num_iters = ((num_scp_files_to_process * 2) // (args.num_jobs_initial + args.num_jobs_final)) with open(args.schedule_out, 'w', encoding='latin-1') as ostream: @@ -115,28 +110,28 @@ def get_schedules(args): lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, + num_scp_files_processed, + num_scp_files_to_process, args.initial_effective_lrate, args.final_effective_lrate) dropout_edit_string = common_train_lib.get_dropout_edit_string( args.dropout_schedule, - float(num_archives_processed) / num_archives_to_process, + float(num_scp_files_processed) / num_scp_files_to_process, iter) frame_shifts = [] egs = [] for job in range(1, current_num_jobs + 1): # k is a zero-based index that we will derive the other indexes from. - k = num_archives_processed + job - 1 - # work out the 1-based archive index. - archive_index = (k % args.num_archives) + 1 - # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - frame_shift = ((archive_index + k // args.num_archives) + k = num_scp_files_processed + job - 1 + # work out the 1-based scp index. + scp_index = (k % args.num_scp_files) + 1 + # previous : frame_shift = (k/num_scp_files) % frame_subsampling_factor + frame_shift = ((scp_index + k // args.num_scp_files) % args.frame_subsampling_factor) frame_shifts.append(str(frame_shift)) - egs.append(str(archive_index)) + egs.append(str(scp_index)) print('{iteration}\t{nj}\t{egs}\t{dropout}\t{lr}\t' '{shifts}'.format(iteration=iter, nj=current_num_jobs, @@ -144,7 +139,7 @@ def get_schedules(args): dropout=dropout_edit_string, lr=lrate, shifts=' '.join(frame_shifts)), file=ostream) - num_archives_processed = num_archives_processed + current_num_jobs + num_scp_files_processed = num_scp_files_processed + current_num_jobs def main(): From 42bc1255ffb52c77fa2b13e418dbe6ba542b27d6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 14 Jan 2019 20:21:54 -0500 Subject: [PATCH 74/87] [src] Various bug fixes --- src/adapt/differentiable-fmllr.cc | 6 ++++-- src/adapt/differentiable-transform-itf.cc | 2 +- src/nnet3a/nnet-chaina-training.cc | 8 +++++--- src/nnet3abin/nnet3-adapt.cc | 12 +++++------- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc index 9d43a465f65..f714c87e033 100644 --- a/src/adapt/differentiable-fmllr.cc +++ b/src/adapt/differentiable-fmllr.cc @@ -141,9 +141,11 @@ void CoreFmllrEstimator::ComputeB() { KALDI_ASSERT(floor > 0.0); MatrixIndexT num_floored = 0; v.ApplyFloor(floor, &num_floored); - if (num_floored > 0.0) + static int num_warned = 100; + if (num_floored > 0.0 && num_warned > 0) KALDI_WARN << num_floored << " out of " << dim - << " singular values floored in L matrix."; + << " singular values floored in L matrix." + << (--num_warned == 0 ? " Will not warn again." : ""); } // f is where we put f(lambda). 
// f_prime is where we put f'(lambda) (the function-derivative of f w.r.t diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc index 1ce8ae38731..e09c0ca7b2c 100644 --- a/src/adapt/differentiable-transform-itf.cc +++ b/src/adapt/differentiable-transform-itf.cc @@ -161,7 +161,7 @@ void DifferentiableTransformMapped::Write(std::ostream &os, bool binary) const { void DifferentiableTransformMapped::Check() const { KALDI_ASSERT(transform != NULL && (pdf_map.empty() || - *std::max_element(pdf_map.begin(), pdf_map.end()) == + 1 + *std::max_element(pdf_map.begin(), pdf_map.end()) == transform->NumClasses())); } diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index bb0f023e014..4ab33ebc424 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -608,7 +608,8 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, success = TrainAdapted( *computation_adapted, supervision, model_training_scale, deriv_weights, - &adapted_input, &adapted_input_deriv); + &adapted_input, + (input_deriv != NULL ? &adapted_input_deriv : NULL)); num_minibatches_processed_++; if (!success) @@ -750,11 +751,12 @@ NnetComputer* NnetChainaBottomTrainer::Forward( *computation, nnet_, delta_nnet_); computer->AcceptInput("input", input); computer->Run(); - computer->GetOutputDestructive("output", output); if (!train_model) { + computer->GetOutputDestructive("output", output); delete computer; return NULL; } else { + *output = computer->GetOutput("output"); return computer; } } @@ -1004,7 +1006,7 @@ void NnetChainaTrainer::Train(const std::string &key, kUndefined), cu_embedding; eg_input.CopyToMat(&cu_input); - bool train_bottom_nnet = bottom_weight != 1.0; + bool train_bottom_nnet = bottom_weight != 0.0; KALDI_ASSERT(cu_input.NumRows() == num_input_frames * num_sequences); NnetComputer *computer = bottom_trainer_.Forward( diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc index b5613dfa7cd..c39b6556365 100644 --- a/src/nnet3abin/nnet3-adapt.cc +++ b/src/nnet3abin/nnet3-adapt.cc @@ -31,11 +31,8 @@ int main(int argc, char *argv[]) { typedef kaldi::int32 int32; const char *usage = - "Initialize nnet3 neural network from a config file; outputs 'raw' nnet\n" - "without associated information such as transition model and priors.\n" - "Search for examples in scripts in /egs/wsj/s5/steps/nnet3/\n" - "Can also be used to add layers to existing model (provide existing model\n" - "as 1st arg)\n" + "This binary supports various modes that manipulate transform objects for\n" + "the nnet3a/chaina adaptation framework. 
See patterns below\n" "\n" "Usage: nnet3-adapt [options] init [] \n" " e.g.: nnet3-adapt --num-classes=201 init init.aconfig 0.ada\n" @@ -94,8 +91,9 @@ int main(int argc, char *argv[]) { ReadIntegerVector(ki.Stream(), binary_in, &(transform.pdf_map)); if (transform.pdf_map.empty()) KALDI_ERR << "Expected to be nonempty vector."; - int32 expected_num_classes = *std::max_element(transform.pdf_map.begin(), - transform.pdf_map.end()); + int32 expected_num_classes = + 1 + *std::max_element(transform.pdf_map.begin(), + transform.pdf_map.end()); if (num_classes > 0 && num_classes != expected_num_classes) KALDI_ERR << "The --num-classes given via the option " << num_classes << " differs from the expected value given the tree-map: " From 8cad2401c2bd63ae69c843dfd0b18cefe51368ca Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 16 Jan 2019 16:26:55 -0500 Subject: [PATCH 75/87] [src,scripts,egs] Various fixes; it trains now. --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 1 + .../chaina/internal/get_train_schedule.py | 51 ++++--- egs/wsj/s5/steps/chaina/train.sh | 136 +++++++++++++----- .../libs/nnet3/train/dropout_schedule.py | 49 +++++-- src/nnet3/nnet-chain-example.cc | 29 +++- src/nnet3abin/nnet3-chaina-train.cc | 13 +- 6 files changed, 208 insertions(+), 71 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 9641911d76f..bea31db47a4 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -325,6 +325,7 @@ if [ $stage -le 21 ]; then --stage $train_stage --cmd "$cmd" \ --xent-regularize $xent_regularize --leaky-hmm-coefficient 0.1 \ --dropout-schedule "$dropout_schedule" \ + --num-jobs-initial 2 --num-jobs-final 4 \ $dir/egs $dir fi diff --git a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py index cfb809b8ecc..fa893cfed22 100755 --- a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py +++ b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py @@ -7,7 +7,8 @@ # Apache 2.0. """ This script outputs information about a neural net training schedule, - to be used by ../train.py. + to be used by ../train.sh, in the form of lines that can be selected + and sourced by the shell. """ import argparse @@ -19,7 +20,7 @@ def get_args(): parser = argparse.ArgumentParser( - description="Output training schedule information to be consumed by ../train.py", + description="""Output training schedule information to be consumed by ../train.sh""", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--frame-subsampling-factor", type=int, default=3, @@ -75,20 +76,11 @@ def get_args(): help="""The number of .scp files in the egs dir.""") parser.add_argument("--schedule-out", type=str, required=True, help="""Output file containing the training schedule. The output - is lines, one per training iteration. Each line contains - tab-separated fields of the form: - - where is an iteration index starting from 0, - is the number of jobs for this iteration (between - num-jobs-initial and num-jobs-final), - is a space-separated string containing the - indexes of the .scp files in the egs dir to use for this - iteration (e.g. 
'1 2 3'), is either the empty - string or something to be passed to the --edits command of - nnet3-am-copy or nnet3-copy; is the - actual learning rate on this iteration (the effective learning - rate times the num-jobs), and is a space-separated - string containing the frame shifts for each job.""") + is lines, one per training iteration. + Each line (one per iteration) is a list of ;-separated commands setting shell + variables. Currently the following variables are set: + iter, num_jobs, inv_num_jobs, scp_indexes, frame_shifts, dropout_opt, lrate. + """) print(sys.argv, file=sys.stderr) args = parser.parse_args() @@ -107,6 +99,12 @@ def get_schedules(args): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + # as a special case, for iteration zero we use just one job + # regardless of the --num-jobs-initial and --num-jobs-final. This + # is because the model averaging does not work reliably for a + # freshly initialized model. + if iter == 0: + current_num_jobs = 1 lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, @@ -115,7 +113,9 @@ def get_schedules(args): args.initial_effective_lrate, args.final_effective_lrate) - dropout_edit_string = common_train_lib.get_dropout_edit_string( + if args.dropout_schedule == "": + args.dropout_schedule = None + dropout_edit_option = common_train_lib.get_dropout_edit_option( args.dropout_schedule, float(num_scp_files_processed) / num_scp_files_to_process, iter) @@ -130,15 +130,22 @@ def get_schedules(args): # previous : frame_shift = (k/num_scp_files) % frame_subsampling_factor frame_shift = ((scp_index + k // args.num_scp_files) % args.frame_subsampling_factor) + + # Instead of frame shifts like [0, 1, 2], we make them more like + # [0, 1, -1]. This is clearer in intent, and keeps the + # supervision starting at frame zero, which IIRC is a + # requirement somewhere in the 'chaina' code. + if frame_shift > (args.frame_subsampling_factor // 2): + frame_shift = frame_shift - args.frame_subsampling_factor + frame_shifts.append(str(frame_shift)) egs.append(str(scp_index)) - print('{iteration}\t{nj}\t{egs}\t{dropout}\t{lr}\t' - '{shifts}'.format(iteration=iter, nj=current_num_jobs, - egs=' '.join(egs), - dropout=dropout_edit_string, lr=lrate, - shifts=' '.join(frame_shifts)), file=ostream) + print("""iter={iter}; num_jobs={nj}; inv_num_jobs={nj_inv}; scp_indexes=(pad {indexes}); frame_shifts=(pad {shifts}); dropout_opt="{opt}"; lrate={lrate}""".format( + iter=iter, nj=current_num_jobs, nj_inv=(1.0 / current_num_jobs), + indexes = ' '.join(egs), shifts=' '.join(frame_shifts), + opt=dropout_edit_option, lrate=lrate), file=ostream) num_scp_files_processed = num_scp_files_processed + current_num_jobs diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index 27b13a43c2a..c10f75b01f3 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -14,7 +14,8 @@ memory_compression_level=2 # Enables us to use larger minibatch size than we # (--> set to 0 if you have plenty of memory. dropout_schedule= srand=0 -max_param_change=2.0 +max_param_change=1.0 # we use a smaller than normal default (it's normally + # 2.0), because there are two models (bottom and top). 
use_gpu=yes # can be "yes", "no", "optional", "wait" common_opts= # Options passed through to nnet3-chaina-train and nnet3-chaina-combine @@ -33,11 +34,20 @@ num_jobs_initial=1 num_jobs_final=1 initial_effective_lrate=0.001 final_effective_lrate=0.0001 -num_groups_per_minibatch=32 # note: if chunks_per_group=4, this would mean 128 - # chunks per minibatch. +groups_per_minibatch=32 # This is how you set the minibatch size. Note: if + # chunks_per_group=4, this would mean 128 chunks per + # minibatch. max_iters_combine=80 max_models_combine=20 +diagnostic_period=5 # Get diagnostics every this-many iterations + +shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the groups + # on each iter. +train=true # use --train false to run only diagnostics. + + + # End configuration section @@ -73,13 +83,14 @@ done frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$dir/init/info.txt) -num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$dir/init/info.txt) +num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$dir/egs/info.txt) steps/chaina/internal/get_train_schedule.py \ --frame-subsampling-factor=$frame_subsampling_factor \ --num-jobs-initial=$num_jobs_initial \ --num-jobs-final=$num_jobs_final \ --num-epochs=$num_epochs \ + --dropout-schedule="$dropout_schedule" \ --num-scp-files=$num_scp_files \ --frame-subsampling-factor=$frame_subsampling_factor \ --initial-effective-lrate=$initial_effective_lrate \ @@ -88,55 +99,116 @@ steps/chaina/internal/get_train_schedule.py \ +if [ "$use_gpu" != "no" ]; then gpu_cmd_opt="--gpu 1"; else gpu_cmd_opt=""; fi + num_iters=$(wc -l <$dir/schedule.txt) +# source the 1st line of schedule.txt in the shell; this sets +# lrate and dropout_opt, among other variables. +. <(head -n 1 $dir/schedule.txt) langs=$(awk '/^langs/ { $1=""; print; }' <$dir/0/info.txt) mkdir -p $dir/log - # Copy models with initial learning rate and dropout options from $dir/init to $dir/0 mkdir -p $dir/0 -lrate=$(awk ' {if(NR-1==0) { print;exit(0);}}' <$dir/schedule.txt | cut -f 5) -dropout_str=$(awk ' {if(NR-1==0) { print;exit(0);}}' <$dir/schedule.txt | cut -f 4) run.pl $dir/log/init_bottom_model.log \ - nnet3-copy --learning-rate=$lrate --edits="$dropout_str" $dir/init/bottom.raw $dir/0/bottom.raw + nnet3-copy --learning-rate=$lrate $dropout_opt $dir/init/bottom.raw $dir/0/bottom.raw for lang in $langs; do run.pl $dir/log/init_model_$lang.log \ - nnet3-am-copy --learning-rate=$lrate --edits="$dropout_str" $dir/init/$lang.mdl $dir/0/$lang.mdl + nnet3-am-copy --learning-rate=$lrate $dropout_opt $dir/init/$lang.mdl $dir/0/$lang.mdl done -iter=0 +x=0 +if [ $stage -gt $x ]; then x=$stage; fi -echo "exiting early" -exit 0 +while [ $x -lt $num_iters ]; do + # Source some variables fromm schedule.txt. The effect will be something + # like the following: + # iter=0; num_jobs=2; inv_num_jobs=0.5; scp_indexes=(pad 1 2); frame_shifts=(pad 1 2); dropout_opt="--edits='set-dropout-proportion name=* proportion=0.0'" lrate=0.002 + . <(grep "^iter=$x;" $dir/schedule.txt) + echo "$0: training, iteration $x, num-jobs is $num_jobs" -# Note: the .ark files are not actually consumed directly downstream (only via -# the top-level .scp files), but we check them anyway for now. -for f in $dir/train.scp $dir/info.txt \ - $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \ - $dir/train.1.scp $dir/train.1.ark; do - if ! [ -f $f -a -s $f ]; then - echo "$0: expected file $f to exist and be nonempty." + next_x=$[$x+1] + model_in_dir=$dir/$x + if [ ! 
-f $model_in_dir/bottom.raw ]; then + echo "$0: expected $model_in_dir/bottom.raw to exist" exit 1 fi -done - - -if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "processed_chaina_egs" ]; then - grep dir_type $dir/info.txt - echo "$0: dir_type should be processed_chaina_egs in $dir/info.txt" - exit 1 -fi + den_fst_dir=$egs_dir/misc + transform_dir=$dir/init + model_out_dir=$dir/${next_x} + + + # for the first 4 iterations, plus every $diagnostic_period iterations, launch + # some diagnostic processes. We don't do this on iteration 0, because + # the batchnorm stats wouldn't be ready + if [ $x -gt 0 ] && [ $[x%diagnostic_period] -eq 0 -o $x -lt 5 ]; then + diagnostic_opts="--bottom-model-test-mode=true --top-model-test-mode=true" + + [ -f $dir/$x/.error_diagnostic ] && rm $dir/$x/.error_diagnostic + for name in train heldout; do + $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.$x.log \ + nnet3-chaina-train $diagnostic_opts --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + --print-interval=10 \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ + || touch $dir/$x/.error_diagnostic & + done + fi -lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + if $train; then + if [ -d $dir/$next_x ]; then + echo "$0: removing previous contents of $dir/$next_x" + rm -r $dir/$next_x || exit 1 + fi + mkdir -p $dir/$next_x + + for j in $(seq $num_jobs); do + scp_index=${scp_indexes[$j]} + frame_shift=${frame_shifts[$j]} + + $cmd $gpu_cmd_opt $dir/log/train.$x.$j.log \ + nnet3-chaina-train --job-id=$j --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + --print-interval=10 --max-param-change=$max_param_change \ + --l2-regularize-factor=$inv_num_jobs --optimization.memory-compression-level=$memory_compression_level \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-copy-egs --frame-shift=$frame_shift scp:$egs_dir/train.$scp_index.scp ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $model_out_dir || touch $dir/$next_x/.error & + done + wait + if [ -f $dir/$next_x/.error ]; then + echo "$0: error detected training on iteration $x" + exit 1 + fi + # First average the bottom models + models=$(for j in $(seq $num_jobs); do echo $dir/$next_x/bottom.$j.raw; done) + run.pl $dir/log/average.$x.log \ + nnet3-average $models - \| \ + nnet3-copy --learning-rate=$lrate $dropout_opt - $dir/$next_x/bottom.raw + rm $models + for lang in $langs; do + models=$dir/$next_x/$lang.*.raw + run.pl $dir/log/average_${lang}.$x.log \ + nnet3-average $models - \| \ + nnet3-am-copy --set-raw-nnet=- --learning-rate=$lrate $dropout_opt $dir/$iter/$lang.mdl $dir/$next_x/$lang.mdl + rm $models + done + fi -for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do - if ! [ -f $f -a -s $f ]; then - echo "$0: expected file $f to exist and be nonempty." 
+ wait + if [ -f $dir/$x/.error_diagnostic ]; then + echo "$0: error detected in diagnostics on iteration $x" exit 1 fi + + # TODO: diagnostics; cleanup + x=$[x+1] done -echo "$0: sucessfully validated processed egs in $dir" + +echo "$0: done" +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0de9074517f..d890f8007e6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -224,18 +224,21 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): return dropout_proportions -def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): - """Return an nnet3-copy --edits line to modify raw_model_string to - set dropout proportions according to dropout_proportions. - E.g. if _dropout_proportions(dropout_schedule, data_fraction) - returns [('*', 0.625)], this will return the string: - "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'" +def get_dropout_edit_option(dropout_schedule, data_fraction, iter_): + """Return an option to be passed to nnet3-copy (or nnet3-am-copy) + that will set the appropriate dropout proportion. If no dropout + is being used (dropout_schedule is None), returns the empty + string, otherwise returns something like + "--edits='set-dropout-proportion name=* proportion=0.625'" Arguments: dropout_schedule: Value for the --trainer.dropout-schedule option. See help for --trainer.dropout-schedule. See _self_test() for examples. + data_fraction: real number in [0,1] that says how far along + in training we are. + iter_: iteration number (needed for debug printing only) See ReadEditConfig() in nnet3/nnet-utils.h to see how set-dropout-proportion directive works. @@ -259,9 +262,39 @@ def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): if _debug_dropout: logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) - return ("""nnet3-copy --edits='{edits}' - - |""".format( - edits=";".join(edit_config_lines))) + return "--edits='{0}'".format(";".join(edit_config_lines)) + + +def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): + """Return an nnet3-copy --edits line to modify raw_model_string to + set dropout proportions according to dropout_proportions. + E.g. if _dropout_proportions(dropout_schedule, data_fraction) + returns [('*', 0.625)], this will return the string: + "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'" + + This is a wrapper of the function get_dropout_edit_option which + gets the --edits option; this function just adds the nnet3-copy + and its arguments. + + Arguments: + dropout_schedule: Value for the --trainer.dropout-schedule option. + See help for --trainer.dropout-schedule. + See _self_test() for examples. + data_fraction: real number in [0,1] that says how far along + in training we are. + iter_: iteration number (needed for debug printing only) + + See ReadEditConfig() in nnet3/nnet-utils.h to see how + set-dropout-proportion directive works. + """ + + edit_option = get_dropout_edit_option(dropout_schedule, data_fraction, iter_) + + if edit_option == "": + return "" + else: + return ("nnet3-copy {0} - - |".format(edit_option)) def _self_test(): """Run self-test. 
diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 9196feb5d15..517c63e394e 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -370,6 +370,30 @@ void GetChainComputationRequest(const Nnet &nnet, KALDI_ERR << "No outputs in computation request."; } + +// Returns the frame subsampling factor, which is the difference between the +// first 't' value we encounter in 'indexes', and the next 't' value that is +// different from the first 't'. It will typically be 3. +// This function will crash if it could not figure it out (e.g. because +// 'indexes' was empty or had only one element). +static int32 GetFrameSubsamplingFactor(const std::vector &indexes) { + + auto iter = indexes.begin(), end = indexes.end(); + int32 cur_t_value; + if (iter != end) { + cur_t_value = iter->t; + ++iter; + } + for (; iter != end; ++iter) { + if (iter->t != cur_t_value) { + KALDI_ASSERT(iter->t > cur_t_value); + return iter->t - cur_t_value; + } + } + KALDI_ERR << "Error getting frame subsampling factor"; + return 0; // Shouldn't be reached, this is to avoid compiler warnings. +} + void ShiftChainExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetChainExample *eg) { @@ -397,10 +421,7 @@ void ShiftChainExampleTimes(int32 frame_shift, sup_end = eg->outputs.end(); for (; sup_iter != sup_end; ++sup_iter) { std::vector &indexes = sup_iter->indexes; - KALDI_ASSERT(indexes.size() >= 2 && indexes[0].n == indexes[1].n && - indexes[0].x == indexes[1].x); - int32 frame_subsampling_factor = indexes[1].t - indexes[0].t; - KALDI_ASSERT(frame_subsampling_factor > 0); + int32 frame_subsampling_factor = GetFrameSubsamplingFactor(indexes); // We need to shift by a multiple of frame_subsampling_factor. // Round to the closest multiple. diff --git a/src/nnet3abin/nnet3-chaina-train.cc b/src/nnet3abin/nnet3-chaina-train.cc index 386e9dc9baa..a42d6f197ad 100644 --- a/src/nnet3abin/nnet3-chaina-train.cc +++ b/src/nnet3abin/nnet3-chaina-train.cc @@ -38,13 +38,15 @@ int main(int argc, char *argv[]) { "use it with a GPU).\n" "\n" "Usage: nnet3-chaina-train [options] \n" - " \n" + " []\n" "\n" " should contain bottom.raw, and .mdl for each language \n" " should contain .den.fst for each language \n" " should contain .ada for each language \n" " is a place to where bottom..raw and ..raw for each language\n" - " that was seen in the egs, will be written (for , see the --job-id option).\n"; + " that was seen in the egs, will be written (for , see the --job-id option).\n" + " If it is not specified, the trained models will not be written (e.g. 
when you are using\n" + " --bottom-model-test-mode=true --top-model-test-mode=true and only want diagnostics).\n"; int32 srand_seed = 0; @@ -69,7 +71,7 @@ int main(int argc, char *argv[]) { srand(srand_seed); - if (po.NumArgs() != 5) { + if (po.NumArgs() < 4 || po.NumArgs() > 5) { po.PrintUsage(); exit(1); } @@ -84,7 +86,7 @@ int main(int argc, char *argv[]) { den_fst_dir = po.GetArg(2), transform_dir = po.GetArg(3), egs_rspecifier = po.GetArg(4), - model_out_dir = po.GetArg(5); + model_out_dir = po.GetOptArg(5); NnetChainaModels models(chaina_opts.nnet_config.zero_component_stats, chaina_opts.bottom_model_test_mode, @@ -102,7 +104,8 @@ int main(int argc, char *argv[]) { ok = trainer.PrintTotalStats(); } - models.WriteRawModels(model_out_dir, binary_write, job_id); + if (po.NumArgs() == 5) + models.WriteRawModels(model_out_dir, binary_write, job_id); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); From 52391c25d9625829088fcb03ba5cb43bf09f63d6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 17 Jan 2019 19:51:27 -0500 Subject: [PATCH 76/87] [scripts] Add missing file --- egs/wsj/s5/steps/chaina/get_model_context.sh | 133 +++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100755 egs/wsj/s5/steps/chaina/get_model_context.sh diff --git a/egs/wsj/s5/steps/chaina/get_model_context.sh b/egs/wsj/s5/steps/chaina/get_model_context.sh new file mode 100755 index 00000000000..42876c8d687 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/get_model_context.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script computes the total left and right context needed for example (eg) +# creation from a set of 'chaina' models. +# See the usage message for more information about input and output formats. + +# Begin configuration section. +frame_subsampling_factor=1 # The total frame subsampling factor of the bottom + # + top model, i.e. the relative difference in + # frame rate between the input of the bottom model + # and the output of the top model. Would normally + # be 3. +bottom_subsampling_factor=1 # The frame subsampling factor of the bottom + # (feature-extracting) model only. Must be a + # divisor of frame_subsampling_factor. Would + # normally be 1 or 3. + +langs=default # the list of languages. This script checks that + # in the dir (first arg to the script), each + # language exists as $lang.mdl, and it warns if + # any model files appear (which might indicate a + # script bug). +# End configuration section + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + cat 1>&2 < +This script works out some acoustic-context-related information, +and writes it, long with the options provided to the script, +to the provided. An example of what +output-info-file> might contain after this script is called, is: +langs default +frame_subsampling_factor 3 +bottom_subsampling_factor 3 +model_left_context 22 +model_right_context 22 + + e.g.: $0 --frame-subsampling-factor 3 --bottom-subsampling-factor 3 + --langs 'default' exp/chaina/tdnn1a_sp/0 exp/chaina/tdnn1a_sp/0/info.txt + + Options: + --frame-subsampling-factor # (default: 1) Total frame subsampling factor of + # both models combined, i.e. ratio of + # frame rate of input features vs. + # alignments and decoding (e.g. 3). 
+ --bottom-subsampling-factor # (default: 1) Controls the frequency at which + # the output of the bottom model is + # evaluated, and the interpretation of frame + # offsets in the top config file. Must be a + # divisor of --frame-subsampling-factor + --langs # The list of languages (must be in quotes, + # to be parsed as a single arg). May be + # 'default' or e.g. 'english french' +EOF + exit 1; +fi + + +dir=$1 +info_file=$2 + +# die on error or undefined variable. +set -e -u + +if [ ! -d $dir ]; then + echo 1>&2 "$0: expected directory $dir to exist" + exit 1 +fi + +if [ -z $langs ]; then + echo 1>&2 "$0: list of languages (--langs option) is empty" + exit 1 +fi + +if ! [ $frame_subsampling_factor -ge 1 ] || \ + ! [ $bottom_subsampling_factor -ge 1 ] || \ + ! [ $[frame_subsampling_factor%bottom_subsampling_factor] -eq 0 ]; then + echo 1>&2 "$0: there was a problem with the options --frame-subsampling-factor=$frame_subsampling_factor --bottom-subsampling-factor=$bottom_subsampling_factor" + exit 1 +fi + +mkdir -p $dir/temp + +if [ ! -s $dir/bottom.raw ]; then + echo 1>&2 "$0: expected file $dir/bottom.raw to exist and be nonempty" + exit 1 +fi + +nnet3-info $dir/bottom.raw > $dir/temp/bottom.info +bottom_left_context=$(grep '^left-context:' $dir/temp/bottom.info | awk '{print $2}') +bottom_right_context=$(grep '^right-context:' $dir/temp/bottom.info | awk '{print $2}') + +max_top_left_context=0 +max_top_right_context=0 + + +for lang in $langs; do + if [ ! -s $dir/$lang.mdl ]; then + echo 1>&2 "$0: expected file $dir/$lang.mdl to exist and be nonempty (check --langs option)" + exit 1 + fi + nnet3-am-info $dir/$lang.mdl > $dir/temp/$lang.info + this_left_context=$(grep '^left-context:' $dir/temp/$lang.info | awk '{print $2}') + this_right_context=$(grep '^right-context:' $dir/temp/$lang.info | awk '{print $2}') + if [ $this_left_context -gt $max_top_left_context ]; then + max_top_left_context=$this_left_context + fi + if [ $this_right_context -gt $max_top_right_context ]; then + max_top_right_context=$this_right_context + fi +done + +left_context=$[bottom_left_context+(max_top_left_context*bottom_subsampling_factor)] +right_context=$[bottom_right_context+(max_top_right_context*bottom_subsampling_factor)] + + +cat >$info_file < Date: Thu, 17 Jan 2019 21:33:30 -0500 Subject: [PATCH 77/87] [scripts] Fixes to bugs found by Gaofeng --- egs/wsj/s5/steps/chaina/get_raw_egs.sh | 2 +- egs/wsj/s5/steps/chaina/train.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/get_raw_egs.sh b/egs/wsj/s5/steps/chaina/get_raw_egs.sh index b637762bb56..b6cd103b263 100755 --- a/egs/wsj/s5/steps/chaina/get_raw_egs.sh +++ b/egs/wsj/s5/steps/chaina/get_raw_egs.sh @@ -125,7 +125,7 @@ latdir=$3 dir=$4 tree=$chaindir/${lang}.tree -trans_mdl=$chaindir/0/${lang}.mdl # contains the transition model and a nnet, but +trans_mdl=$chaindir/init/${lang}.mdl # contains the transition model and a nnet, but # we won't be making use of the nnet part. normalization_fst=$chaindir/den_fsts/${lang}.normalization.fst den_fst=$chaindir/den_fsts/${lang}.den.fst diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index c10f75b01f3..0c02d6c5f1c 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -105,7 +105,7 @@ num_iters=$(wc -l <$dir/schedule.txt) # source the 1st line of schedule.txt in the shell; this sets # lrate and dropout_opt, among other variables. . 
<(head -n 1 $dir/schedule.txt) -langs=$(awk '/^langs/ { $1=""; print; }' <$dir/0/info.txt) +langs=$(awk '/^langs/ { $1=""; print; }' <$dir/init/info.txt) mkdir -p $dir/log From 5aabf44380c301fa1ba69f0eda575d27d28578e5 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 18 Jan 2019 17:25:36 -0500 Subject: [PATCH 78/87] [src,scripts] Finish some code to estimate target model at end of training --- egs/wsj/s5/steps/chaina/train.sh | 83 ++++++++++++++++++++-- src/adapt/differentiable-fmllr.cc | 27 ++++++- src/adapt/differentiable-fmllr.h | 15 ++-- src/adapt/differentiable-transform-itf.h | 5 ++ src/adapt/differentiable-transform-test.cc | 1 + src/adapt/differentiable-transform.cc | 16 +++++ src/adapt/differentiable-transform.h | 24 ++++--- src/adapt/generic-transform.cc | 16 +++++ src/adapt/generic-transform.h | 5 ++ src/nnet3a/nnet-chaina-training.cc | 79 ++++++++++++++------ src/nnet3a/nnet-chaina-training.h | 50 ++++++++++--- src/nnet3abin/nnet3-adapt.cc | 29 ++++++-- src/nnet3abin/nnet3-chaina-train.cc | 3 +- 13 files changed, 299 insertions(+), 54 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index 0c02d6c5f1c..91109343858 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -102,6 +102,9 @@ steps/chaina/internal/get_train_schedule.py \ if [ "$use_gpu" != "no" ]; then gpu_cmd_opt="--gpu 1"; else gpu_cmd_opt=""; fi num_iters=$(wc -l <$dir/schedule.txt) + +echo "$0: will train for $num_epochs epochs = $num_iters iterations" + # source the 1st line of schedule.txt in the shell; this sets # lrate and dropout_opt, among other variables. . <(head -n 1 $dir/schedule.txt) @@ -145,14 +148,15 @@ while [ $x -lt $num_iters ]; do # some diagnostic processes. We don't do this on iteration 0, because # the batchnorm stats wouldn't be ready if [ $x -gt 0 ] && [ $[x%diagnostic_period] -eq 0 -o $x -lt 5 ]; then - diagnostic_opts="--bottom-model-test-mode=true --top-model-test-mode=true" [ -f $dir/$x/.error_diagnostic ] && rm $dir/$x/.error_diagnostic for name in train heldout; do $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.$x.log \ - nnet3-chaina-train $diagnostic_opts --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ - --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ - --print-interval=10 \ + nnet3-chaina-train --use-gpu=$use_gpu \ + --bottom-model-test-mode=true --top-model-test-mode=true + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --xent-regularize=$xent_regularize \ + --print-interval=10 \ $model_in_dir $den_fst_dir $transform_dir \ "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ || touch $dir/$x/.error_diagnostic & @@ -162,7 +166,7 @@ while [ $x -lt $num_iters ]; do if $train; then if [ -d $dir/$next_x ]; then echo "$0: removing previous contents of $dir/$next_x" - rm -r $dir/$next_x || exit 1 + rm -r $dir/$next_x fi mkdir -p $dir/$next_x @@ -209,6 +213,75 @@ while [ $x -lt $num_iters ]; do x=$[x+1] done +# TODO: later we'll have a model combination phase. + +if [ $stage -le $num_iters ] && $train; then + # Now accumulate the class-dependent mean (and variance) stats of the + # adaptation model, which will be needed for decoding. We remove the map that + # had reduced the num-classes from several thousand to (e.g.) 200, because we + # are now estimating the means on a larger set of data and we're not concerned + # about noisy estimates. 
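+  # Each parallel job below writes its per-language stats to
+  # $dir/final/${lang}.JOB.ada; these are summed and turned into the final
+  # class-dependent means by the 'nnet3-adapt estimate' calls that follow.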
+ mkdir -p $dir/transforms_unmapped + # Note: the plan was to add the option --remove-pdf-map=true to the 'copy' + # command below (to use the full number of pdf-ids as classes in test time), + # but it seemed to degrade the objective function, based on diagnostics. + # We'll look into this later. + for lang in $langs; do + run.pl $dir/log/copy_transform_${lang}.log \ + nnet3-adapt copy $dir/init/${lang}.ada $dir/transforms_unmapped/${lang}.ada + done + if [ -d $dir/final ]; then + echo "$0: removing previous contents of $dir/final" + rm -r $dir/final + fi + mkdir -p $dir/final + den_fst_dir=$egs_dir/misc + + $cmd $gpu_cmd_opt JOB=1:$num_scp_files $dir/log/acc_target_model.JOB.log \ + nnet3-chaina-train --job-id=JOB --use-gpu=$use_gpu \ + --print-interval=10 \ + --bottom-model-test-mode=true --top-model-test-mode=true \ + --adaptation-model-accumulate=true \ + $dir/$num_iters $den_fst_dir $dir/transforms_unmapped \ + "ark:nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size scp:$egs_dir/train.JOB.scp ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $dir/final + + for lang in $langs; do + stats=$dir/final/${lang}.*.ada + run.pl $dir/log/estimate_target_model_${lang}.log \ + nnet3-adapt estimate $stats $dir/final/${lang}.ada + #rm $stats + done + cp $dir/$num_iters/bottom.raw $dir/$num_iters/*.mdl $dir/final +fi + +if [ $stage -le $[num_iters+1] ]; then + # Accumulate some final diagnostics. The difference with the last iteration's + # diagnostics is that we use test-mode for the adaptation model (i.e. a target + # model computed from all the data, not just one minibatch). + [ -f $dir/final/.error_diagnostic ] && rm $dir/final/.error_diagnostic + for name in train heldout; do + den_fst_dir=$egs_dir/misc + $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.final.log \ + nnet3-chaina-train --use-gpu=$use_gpu \ + --bottom-model-test-mode=true --top-model-test-mode=true \ + --adaptation-test-mode=true \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --xent-regularize=$xent_regularize \ + --print-interval=10 \ + $dir/final $den_fst_dir $dir/final \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ + || touch $dir/final/.error_diagnostic & + done + wait + if [ -f $dir/final/.error_diagnostic ]; then + echo "$0: error getting final diagnostic information" + exit 1 + fi +fi + + +transform_dir=$dir/init echo "$0: done" exit 0 diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc index f714c87e033..66bda183cfe 100644 --- a/src/adapt/differentiable-fmllr.cc +++ b/src/adapt/differentiable-fmllr.cc @@ -239,7 +239,9 @@ void CoreFmllrEstimator::Backward(const MatrixBase &A_deriv, GaussianEstimator::GaussianEstimator(int32 num_classes, int32 feature_dim): gamma_(num_classes), m_(num_classes, feature_dim), - v_(num_classes) { + v_(num_classes), + variance_floor_(-1), variance_sharing_weight_(-1) { + // the floor and weight are actually set later on, in Estimate(). 
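+  // gamma_, m_ and v_ start out at zero; they are the raw accumulated stats,
+  // which is why the Add() function added further down in this file can
+  // combine estimators from different jobs by simply summing them.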
KALDI_ASSERT(num_classes > 0 && feature_dim > 0); } @@ -375,6 +377,13 @@ void GaussianEstimator::AccStatsBackward( void GaussianEstimator::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + gamma_.Write(os, binary); + m_.Write(os, binary); + v_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_floor_); + WriteBasicType(os, binary, variance_sharing_weight_); WriteToken(os, binary, ""); mu_.Write(os, binary); WriteToken(os, binary, ""); @@ -382,8 +391,22 @@ void GaussianEstimator::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); } +void GaussianEstimator::Add(const GaussianEstimator &other) { + gamma_.AddVec(1.0, other.gamma_); + m_.AddMat(1.0, other.m_); + v_.AddVec(1.0, other.v_); +} + + void GaussianEstimator::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); + ExpectOneOrTwoTokens(is, binary, "", ""); + gamma_.Read(is, binary); + m_.Read(is, binary); + v_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_floor_); + ReadBasicType(is, binary, &variance_sharing_weight_); + ExpectToken(is, binary, ""); mu_.Read(is, binary); ExpectToken(is, binary, ""); t_.Read(is, binary); diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h index db8637a3ded..e2db94102a2 100644 --- a/src/adapt/differentiable-fmllr.h +++ b/src/adapt/differentiable-fmllr.h @@ -348,11 +348,15 @@ class GaussianEstimator { const MatrixBase *feats_deriv); - // Note: the Write() and Read() functions are only designed to write the means - // mu_ and the smoothed variances t_. We'll later modify them to (maybe - // conditionally) write other things if needed. void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); + + // Adds any statistics in gamma_, m_ and v_ from 'other' to *this. + // Used when summing adaptation-model statistics over multiple + // jobs. Requires that '*this' and 'other' have identical + // structure. + void Add(const GaussianEstimator &other); + private: /* Notes on implementation of GaussianEstimator. @@ -418,8 +422,9 @@ class GaussianEstimator { // it's of dimension num_classes. Vector v_; - // variance_floor_ and variance_sharing_weight_ are copies of the corresponding - // variables in class FmllrEstimatorOptions; they are set when Estimate() is called. + // variance_floor_ and variance_sharing_weight_ are copies of the + // corresponding variables in class FmllrEstimatorOptions; they are set when + // Estimate() is called. They are temporaries, not permanent members. BaseFloat variance_floor_; BaseFloat variance_sharing_weight_; diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index fe5240f9618..00c58a076b9 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -261,6 +261,11 @@ class DifferentiableTransform { int32 num_spk, const Posterior &posteriors) = 0; + // Adds any stats accumulated via Accumulate() that are present in 'other' to + // 'this'. Used when summing adaptation-model statistics across multiple + // jobs. + virtual void Add(const DifferentiableTransform &other) = 0; + // To be called after repeated calls to Accumulate(), does any estimation that // is required in training time (normally per-speaker means and possibly // variances. 
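The new Accumulate()/Add()/Estimate() sequence is what lets the final stats-accumulation pass run in parallel: each nnet3-chaina-train job accumulates into its own copy of the transform and writes it out, and a single combining step then sums the per-job copies and estimates the class-dependent target models. A rough sketch of that combining step follows; it mirrors the 'estimate' command added to nnet3-adapt.cc later in this patch, and the filenames and the fixed list of three jobs are purely illustrative.

    // Sum per-job adaptation stats and estimate the final target models.
    std::vector<std::string> job_stats = { "final/default.1.ada",
                                           "final/default.2.ada",
                                           "final/default.3.ada" };
    DifferentiableTransformMapped transform;
    ReadKaldiObject(job_stats[0], &transform);
    for (size_t j = 1; j < job_stats.size(); j++) {
      DifferentiableTransformMapped other;
      ReadKaldiObject(job_stats[j], &other);
      transform.transform->Add(*(other.transform));  // sum the raw stats
    }
    transform.transform->Estimate(0);  // iteration 0 is the normal case
    WriteKaldiObject(transform, "final/default.ada", true /* binary */);
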
diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc index ff76d7c738f..cde695a6ab4 100644 --- a/src/adapt/differentiable-transform-test.cc +++ b/src/adapt/differentiable-transform-test.cc @@ -195,6 +195,7 @@ void TestTraining(DifferentiableTransform *transform) { int32 num_final_iters = transform->NumFinalIterations(); for (int32 i = 0; i < num_final_iters; i++) { transform->Accumulate(i, input_feats, num_chunks, num_spk, post); + transform->Add(*transform); // Just check Add() does not crash. transform->Estimate(i); } CuMatrix output_feats2(output_feats.NumRows(), diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc index e008f35adc7..b1a5f799c96 100644 --- a/src/adapt/differentiable-transform.cc +++ b/src/adapt/differentiable-transform.cc @@ -66,6 +66,12 @@ DifferentiableTransform *FmllrTransform::Copy() const { return new FmllrTransform(*this); } +void FmllrTransform::Add(const DifferentiableTransform &other_in) { + const FmllrTransform *other = dynamic_cast(&other_in); + if (target_model_ && other->target_model_) + target_model_->Add(*(other->target_model_)); +} + void FmllrTransform::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); @@ -94,6 +100,7 @@ void FmllrTransform::Read(std::istream &is, bool binary) { ReadToken(is, binary, &tok); if (tok == "") { target_model_ = new GaussianEstimator(num_classes_, dim_); + target_model_->Read(is, binary); } // else "". ExpectToken(is, binary, ""); } @@ -336,6 +343,14 @@ MeanOnlyTransform::MeanOnlyTransform(const MeanOnlyTransform &other): dim_(other.dim_), target_model_(other.target_model_ == NULL ? NULL : new GaussianEstimator(*other.target_model_)) { } + +void MeanOnlyTransform::Add(const DifferentiableTransform &other_in) { + const MeanOnlyTransform *other = + dynamic_cast(&other_in); + if (target_model_ && other->target_model_) + target_model_->Add(*(other->target_model_)); +} + void MeanOnlyTransform::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); @@ -362,6 +377,7 @@ void MeanOnlyTransform::Read(std::istream &is, bool binary) { ReadToken(is, binary, &tok); if (tok == "") { target_model_ = new GaussianEstimator(num_classes_, dim_); + target_model_->Read(is, binary); } // else "". ExpectToken(is, binary, ""); } diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h index c0dfe027969..98d55e6e237 100644 --- a/src/adapt/differentiable-transform.h +++ b/src/adapt/differentiable-transform.h @@ -106,16 +106,20 @@ class FmllrTransform: public DifferentiableTransform { void Read(std::istream &is, bool binary) override; + void Add(const DifferentiableTransform &other) override; + ~FmllrTransform(); private: int32 dim_; FmllrEstimatorOptions fmllr_opts_; - // Note: this target model is only for use in test time. We allocate it the - // first time Accumulate() is called. In training time we estimate it - // minibatch by minibatch (which is why we don't expect to have that many - // classes). At the end of training we'll accumulate stats here in + // Note: this target model is only for consumption in test time; it is + // produced right at the end of training when Accumulate() and Estimate() are + // called. We allocate it the first time Accumulate() is called. 
In training + // time the corresponding stats are esimated minibatch by minibatch, not via + // this member (which is why we don't expect to have that many classes in + // training time). At the end of training we'll accumulate stats here in // Accumulate(), and Estimate() will estimate it. GaussianEstimator *target_model_; }; @@ -221,14 +225,18 @@ class MeanOnlyTransform: public DifferentiableTransform { void Read(std::istream &is, bool binary) override; + void Add(const DifferentiableTransform &other) override; + ~MeanOnlyTransform(); private: int32 dim_; - // Note: this target model is only for use in test time. We allocate it the - // first time Accumulate() is called. In training time we estimate it - // minibatch by minibatch (which is why we don't expect to have that many - // classes). At the end of training we'll accumulate stats here in + // Note: this target model is only for consumption in test time; it is + // produced right at the end of training when Accumulate() and Estimate() are + // called. We allocate it the first time Accumulate() is called. In training + // time the corresponding stats are esimated minibatch by minibatch, not via + // this member (which is why we don't expect to have that many classes in + // training time). At the end of training we'll accumulate stats here in // Accumulate(), and Estimate() will estimate it. GaussianEstimator *target_model_; }; diff --git a/src/adapt/generic-transform.cc b/src/adapt/generic-transform.cc index 12cbe938d03..24ccc5c396a 100644 --- a/src/adapt/generic-transform.cc +++ b/src/adapt/generic-transform.cc @@ -142,6 +142,14 @@ void SequenceTransform::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); } +void SequenceTransform::Add(const DifferentiableTransform &other_in) { + const SequenceTransform *other = dynamic_cast( + &other_in); + KALDI_ASSERT(transforms_.size() == other->transforms_.size()); + for (size_t i = 0; i < transforms_.size(); i++) + transforms_[i]->Add(*(other->transforms_[i])); +} + int32 SequenceTransform::Dim() const { size_t num_transforms = transforms_.size(); KALDI_ASSERT(num_transforms > 0); @@ -388,6 +396,14 @@ void AppendTransform::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); } +void AppendTransform::Add(const DifferentiableTransform &other_in) { + const AppendTransform *other = dynamic_cast( + &other_in); + KALDI_ASSERT(transforms_.size() == other->transforms_.size()); + for (size_t i = 0; i < transforms_.size(); i++) + transforms_[i]->Add(*(other->transforms_[i])); +} + int32 AppendTransform::Dim() const { size_t num_transforms = transforms_.size(); KALDI_ASSERT(num_transforms > 0); diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h index 3d910e471cb..602bbabc656 100644 --- a/src/adapt/generic-transform.h +++ b/src/adapt/generic-transform.h @@ -114,6 +114,7 @@ class NoOpTransform: public DifferentiableTransform { void Read(std::istream &is, bool binary) override; + void Add(const DifferentiableTransform &other) override { } private: int32 dim_; }; @@ -200,6 +201,8 @@ class SequenceTransform: public DifferentiableTransform { void Read(std::istream &is, bool binary) override; + void Add(const DifferentiableTransform &other) override; + ~SequenceTransform() override; private: std::vector transforms_; @@ -296,6 +299,8 @@ class AppendTransform: public DifferentiableTransform { void Read(std::istream &is, bool binary) override; ~AppendTransform(); + + void Add(const DifferentiableTransform &other) override; private: std::vector transforms_; }; 
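The trainer changes that follow wire these hooks into NnetChainaTopTrainer and NnetChainaModels. Condensed to its essentials (this is only a sketch of the control flow added below, using the same member names; the unadapted pass, derivative handling and error checking are omitted), the adapted part of Train() becomes:

    if (opts_.adaptation_model_accumulate) {
      // Final stats-accumulation pass: no adapted forward/backward at all.
      // Iteration index 0 is the normal case; >0 will be needed later for
      // cascaded transforms.
      transform_.transform->Accumulate(0, input, num_sequences, num_groups,
                                       post_padded);
      return true;
    }
    if (opts_.adaptation_test_mode) {
      // Use the target models previously estimated via Accumulate()+Estimate().
      transform_.transform->TestingForwardBatch(
          input, num_sequences, num_groups, post_padded, &adapted_input);
    } else {
      // Normal training: estimate target models from this minibatch and keep
      // the info needed later for TrainingBackward().
      minibatch_info = transform_.transform->TrainingForward(
          input, num_sequences, num_groups, post_padded, &adapted_input);
    }
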
diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 4ab33ebc424..83e7ba520b5 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -28,12 +28,14 @@ NnetChainaModels::NnetChainaModels( bool zero_component_stats, bool bottom_model_test_mode, bool top_model_test_mode, + bool adaptation_model_accumulate, const std::string &model_dir, const std::string &den_fst_dir, const std::string &transform_dir): zero_component_stats_(zero_component_stats), bottom_model_test_mode_(bottom_model_test_mode), top_model_test_mode_(top_model_test_mode), + adaptation_model_accumulate_(adaptation_model_accumulate), model_dir_(model_dir), den_fst_dir_(den_fst_dir), transform_dir_(transform_dir) { @@ -148,31 +150,45 @@ NnetChainaModels::GetTransformForLang( -void NnetChainaModels::WriteRawModels(const std::string &model_out_dir, - bool binary, - int32 job_id) { +void NnetChainaModels::Write(const std::string &model_out_dir, + bool binary, int32 job_id) { + std::ostringstream ss; if (!bottom_model_test_mode_) { + ss << "bottom nnet and "; std::string bottom_model_name; GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); } - std::ostringstream lang_names_ss; - for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { - const std::string &lang_name = iter->first; - lang_names_ss << lang_name << " "; - LanguageInfo *info = iter->second; - { - // we write it as a 'raw' model without the TransitionModel or - // the AmNnetSimple wrapper, since we can reconstruct those parts - // from the previous iter's model. - std::string top_model_name; - GetPathname(model_out_dir, lang_name, job_id, "raw", &top_model_name); - WriteKaldiObject(info->am_nnet.GetNnet(), top_model_name, binary); + if (!top_model_test_mode_) { + ss << "nnets for languages "; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + { + // we write it as a 'raw' model without the TransitionModel or + // the AmNnetSimple wrapper, since we can reconstruct those parts + // from the previous iter's model. + std::string top_model_name; + GetPathname(model_out_dir, lang_name, job_id, "raw", &top_model_name); + WriteKaldiObject(info->am_nnet.GetNnet(), top_model_name, binary); + } } } - KALDI_LOG << "Wrote " << (bottom_model_test_mode_ ? 
"" : " bottom nnet and ") - << "nnets for languages " << lang_names_ss.str() << "to " - << model_out_dir; + if (adaptation_model_accumulate_) { + ss << "adaptation-model stats for languages "; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + { + std::string transform_name; + GetPathname(model_out_dir, lang_name, job_id, "ada", &transform_name); + WriteKaldiObject(info->transform, transform_name, binary); + } + } + } + KALDI_LOG << "Wrote " << ss.str() << "to " << model_out_dir; } @@ -198,6 +214,8 @@ NnetChainaTopTrainer::NnetChainaTopTrainer( num_minibatches_processed_(0), max_change_stats_(*nnet) { + config.Check(); + if (opts_.nnet_config.zero_component_stats) ZeroComponentStats(nnet); @@ -593,6 +611,18 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, transform_.transform->NumClasses(), &post_padded); + if (opts_.adaptation_model_accumulate) { + // We will later add a way to handle iteration indexes >0, which is needed + // when the adaptation model contains cascaded transforms, but 0 is the + // normal case. + int32 accumulate_iter = 0; + transform_.transform->Accumulate(accumulate_iter, input, + num_sequences, num_groups, + post_padded); + return true; // We don't be evaluating the adapted version of the top model + } + + structure.adapted = true; std::shared_ptr computation_adapted = GetComputation(structure); @@ -602,8 +632,14 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, adapted_input_deriv; using namespace differentiable_transform; - MinibatchInfoItf *minibatch_info = transform_.transform->TrainingForward( - input, num_sequences, num_groups, post_padded, &adapted_input); + MinibatchInfoItf *minibatch_info = NULL; + if (!opts_.adaptation_test_mode) { + minibatch_info = transform_.transform->TrainingForward( + input, num_sequences, num_groups, post_padded, &adapted_input); + } else { + transform_.transform->TestingForwardBatch( + input, num_sequences, num_groups, post_padded, &adapted_input); + } success = TrainAdapted( *computation_adapted, supervision, @@ -617,10 +653,11 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, if (input_deriv == NULL) delete minibatch_info; - else + else { transform_.transform->TrainingBackward(input, adapted_input_deriv, num_sequences, num_groups, post_padded, minibatch_info, input_deriv); + } return true; } diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index d6123462432..eb9b81d40db 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -44,6 +44,8 @@ struct NnetChainaTrainingOptions { bool keep_embedding_context; bool bottom_model_test_mode; bool top_model_test_mode; + bool adaptation_model_accumulate; + bool adaptation_test_mode; NnetChainaTrainingOptions(): apply_deriv_weights(true), @@ -52,7 +54,9 @@ struct NnetChainaTrainingOptions { bottom_subsampling_factor(1), keep_embedding_context(true), bottom_model_test_mode(false), - top_model_test_mode(false) { } + top_model_test_mode(false), + adaptation_model_accumulate(false), + adaptation_test_mode(false) { } void Register(OptionsItf *opts) { nnet_config.Register(opts); @@ -101,11 +105,31 @@ struct NnetChainaTrainingOptions { "statistics for the top model (to keep the batchnorm " "stats frozen). 
Setting this to false can be used to " "evaluate train or valid probs."); + opts->Register("adaptation-model-accumulate", &adaptation_model_accumulate, + "Set this to true if you want to accumulate stats for " + "the adaptation model (i.e., its class-dependent means). " + "This will normally be done just once after training the " + "model, and will cause the adaptation objects to be " + "written out to . If this option is given, " + "the speaker adapted pass of the top model, and training " + "of the top or bottom model, will not be done; and we " + "expect --bottom-model-test-mode=true and " + "--top-model-test-mode=true to be set."); + opts->Register("adaptation-test-mode", &adaptation_test_mode, + "If true, use test mode for the adaptation model, which " + "means we'll use previously computed target models " + "rather than ones estimated from the minibatch. Training of " + "the bottom model is currently not supported in this case " + "(and, in any case, is likely undesirable)."); } - void Check() { + void Check() const { KALDI_ASSERT(unadapted_top_weight > 0.0 && unadapted_bottom_weight >= 0.0 && bottom_subsampling_factor > 0); + if (adaptation_model_accumulate) + KALDI_ASSERT(top_model_test_mode && bottom_model_test_mode); + if (adaptation_test_mode) + KALDI_ASSERT(bottom_model_test_mode); } }; @@ -160,6 +184,9 @@ class NnetChainaModels { whether to write the top models in WriteRawModels(), and whether to zero the component stats, set batch-norm test mode, and collapse the model. + @param [in] adaptation_model_accumulate If true, the adaptation + models will be written out instead of the top models. + Expect both test modes above to be true in this case. @param [in] model_dir Directory where we'll find bottom.raw, and .mdl for each language present in the egs (the will be worked out from the key name from @@ -176,6 +203,7 @@ class NnetChainaModels { NnetChainaModels(bool zero_component_stats, bool bottom_model_test_mode, bool top_model_test_mode, + bool adaptation_model_accumulate, const std::string &model_dir, const std::string &den_fst_dir, const std::string &transform_dir); @@ -203,13 +231,17 @@ class NnetChainaModels { differentiable_transform::DifferentiableTransformMapped *GetTransformForLang( const std::string &language_name); - // Writes the files - // /bottom..raw + // Writes out the following files: + // /bottom..raw (if !bottom_model_test_mode) // and, for each language that we accessed, - // /..raw - void WriteRawModels(const std::string &model_out_dir, - bool binary, - int32 job_id); + // /..raw (if !top_model_test_mode) + // /..ada (if adaptation_model_accumulate) + // + // Thus, this writes out any models that we trained. There is no + // corresponding Read() function. + void Write(const std::string &model_out_dir, + bool binary, + int32 job_id); ~NnetChainaModels(); private: @@ -252,6 +284,8 @@ class NnetChainaModels { bool bottom_model_test_mode_; // A copy of the "top-model-test-mode" option in NnetChainaTrainingOptions. bool top_model_test_mode_; + // A copy of the "adaptation-model-accumulate" option in NnetChainaTrainingOptions. + bool adaptation_model_accumulate_; // Directory where models are located. std::string model_dir_; // Directory where denominator FSTs are located. diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc index c39b6556365..de540e49b71 100644 --- a/src/nnet3abin/nnet3-adapt.cc +++ b/src/nnet3abin/nnet3-adapt.cc @@ -35,12 +35,15 @@ int main(int argc, char *argv[]) { "the nnet3a/chaina adaptation framework. 
See patterns below\n" "\n" "Usage: nnet3-adapt [options] init [] \n" - " e.g.: nnet3-adapt --num-classes=201 init init.aconfig 0.ada\n" + "(e.g.: nnet3-adapt --num-classes=201 init init.aconfig 0.ada)\n" " or: nnet3-adapt init init.aconfig tree.map 0.ada\n" - " or: nnet3-adapt [options] copy \n" - " e.g.: nnet3-adapt copy --binary=false 0.ada 0.txt\n" + " or: nnet3-adapt [options] copy \n" + "(e.g.: nnet3-adapt copy --binary=false 0.ada 0.txt)\n" " or: nnet3-adapt info \n" - " e.g.: nnet3-adapt info 0.ada\n" + "(e.g.: nnet3-adapt info 0.ada\n" + " or: nnet3-adapt estimate ... \n" + " .. which sums stats and calls Estimate(), to get the final class-dependent means... \n" + "(e.g. nnet3-adapt estimate foo/final/default.{1,2,3,4,5,6}.ada foo/final/default.ada\n" " or: nnet3-adapt [options] adapt \n" "\n" "See also: nnet3-chaina-train\n"; @@ -48,6 +51,7 @@ int main(int argc, char *argv[]) { bool binary_write = true; bool remove_pdf_map = false; int32 num_classes = -1; + int32 iter = 0; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -58,6 +62,8 @@ int main(int argc, char *argv[]) { "For the 'copy' command: if true, the pdf_map will be " "removed so that the transform will be based on " "pdf-ids."); + po.Register("iter", &iter, "Only for the 'estimate' command: iteration " + "of estimation, will always be 0 in most setups."); po.Read(argc, argv); @@ -131,6 +137,21 @@ int main(int argc, char *argv[]) { } else if (po.GetOptArg(1) == "adapt" && po.NumArgs() == 5) { KALDI_ERR << "The 'adapt' command has not been implemented yet."; return 0; + } else if (po.GetOptArg(1) == "estimate" && po.NumArgs() >= 3) { + DifferentiableTransformMapped transform; + std::string transform_rxfilename = po.GetArg(2); + ReadKaldiObject(transform_rxfilename, &transform); + for (int32 i = 3; i < po.NumArgs(); i++) { + std::string other_transform_rxfilename = po.GetArg(i); + DifferentiableTransformMapped other_transform; + ReadKaldiObject(other_transform_rxfilename, &other_transform); + // sum the stats. 
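+        // Add() requires 'other_transform' to have the same structure as
+        // 'transform' (e.g. the same sequence of component transforms), which
+        // holds when all inputs were written by jobs of the same training run.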
+ transform.transform->Add(*(other_transform.transform)); + } + transform.transform->Estimate(iter); + std::string transform_wxfilename = po.GetArg(po.NumArgs()); + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; } else { po.PrintUsage(); exit(1); diff --git a/src/nnet3abin/nnet3-chaina-train.cc b/src/nnet3abin/nnet3-chaina-train.cc index a42d6f197ad..aade5070ee0 100644 --- a/src/nnet3abin/nnet3-chaina-train.cc +++ b/src/nnet3abin/nnet3-chaina-train.cc @@ -91,6 +91,7 @@ int main(int argc, char *argv[]) { NnetChainaModels models(chaina_opts.nnet_config.zero_component_stats, chaina_opts.bottom_model_test_mode, chaina_opts.top_model_test_mode, + chaina_opts.adaptation_model_accumulate, model_in_dir, den_fst_dir, transform_dir); { @@ -105,7 +106,7 @@ int main(int argc, char *argv[]) { ok = trainer.PrintTotalStats(); } if (po.NumArgs() == 5) - models.WriteRawModels(model_out_dir, binary_write, job_id); + models.Write(model_out_dir, binary_write, job_id); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); From bb448ffb2d58e5dda31911808d88ee35e3576c0a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 18 Jan 2019 23:44:53 -0500 Subject: [PATCH 79/87] [scripts,egs] Further progress; finish SI decoding --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 29 +- .../s5/local/chaina/tuning/run_tdnn_1b.sh | 482 ++++++++++++++++++ egs/wsj/s5/steps/chaina/compute_embeddings.sh | 86 ++++ egs/wsj/s5/steps/chaina/decode_si.sh | 141 +++++ egs/wsj/s5/steps/chaina/get_model_context.sh | 1 + egs/wsj/s5/steps/chaina/train.sh | 16 +- egs/wsj/s5/steps/diagnostic/analyze_lats.sh | 9 +- egs/wsj/s5/steps/nnet3/compute_output.sh | 1 + egs/wsj/s5/steps/nnet3/decode.sh | 2 +- 9 files changed, 762 insertions(+), 5 deletions(-) create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh create mode 100755 egs/wsj/s5/steps/chaina/compute_embeddings.sh create mode 100755 egs/wsj/s5/steps/chaina/decode_si.sh diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index bea31db47a4..438ce03647f 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -25,7 +25,7 @@ get_egs_stage=-10 chunk_width=140 dropout_schedule='0,0@0.20,0.3@0.50,0' xent_regularize=0.1 -bottom_subsampling_factor=3 +bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. frame_subsampling_factor=3 langs="default" # list of language names @@ -331,6 +331,33 @@ if [ $stage -le 21 ]; then fi +if [ $stage -le 22 ]; then + # Dump the bottom-nnet outputs for this data. + test_sets=dev_clean_2 + for data in $test_sets; do + steps/chaina/compute_embeddings.sh data/${data}_hires $dir/final $dir/data/final/${data} + done +fi + +if [ $stage -le 23 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 24 ]; then + # Do the speaker-independent decoding pass + test_sets=dev_clean_2 + for data in $test_sets; do + steps/chaina/decode_si.sh --cmd "$cmd" --nj 10 --num-threads 4 \ + data/${data}_hires $tree_dir/graph_tgsmall\ + $dir/final $dir/data/final/${data} \ + $dir/decode_${data}_tgsmall + done +fi + exit 0; diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..f0917b14ae2 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 + + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. 
It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l " + echo "e.g.: steps/chaina/compute_embeddings.sh --nj 8 \\" + echo " data/test_eval92_hires exp/chaina/tdnn1_sp/final exp/nnet3/tdnn1_sp/data/final/test_eval92_hires" + echo "Output will be in /output.scp" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --iter # Iteration of model to decode; default is final." 
+ exit 1; +fi + +data=$1 +model_dir=$2 +dir=$3 + +mkdir -p $dir/log + +# convert $dir to absolute pathname +fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` + +for f in $model_dir/bottom.raw $model_dir/info.txt $data/feats.scp; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + exit 1 + fi +done + + +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + + +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +if ! [ $bottom_subsampling_factor -gt 0 ]; then + echo "$0: error getting bottom_subsampling_factor from $model_dir/info.txt" + exit 1 +fi + + + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/compute.JOB.log \ + nnet3-compute --use-gpu=no \ + --frame-subsampling-factor=$bottom_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + $model_dir/bottom.raw scp:$sdata/JOB/feats.scp \ + "ark:|copy-feats --compress=$compress ark:- ark,scp:$dir/output.JOB.ark,$dir/output.JOB.scp" +fi + +for n in $(seq $nj); do + cat $dir/output.$n.scp +done > $dir/output.scp + +exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode_si.sh b/egs/wsj/s5/steps/chaina/decode_si.sh new file mode 100755 index 00000000000..7c9c69b8f37 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/decode_si.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does the speaker-independent pass of decoding with a 'chaina' model, +# and it leaves the embeddings on disk ready to be used in the adapted pass of +# decoding. + + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=1.0 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=10.0 # This is typically used in 'chain' systems to scale + # acoustics by 10 so the regular scoring script works OK + # (since it evaluates the LM scale at integer values, + # typically close to 10). We make this the default in + # order to make scoring easier, but you should remember + # when using the lattices, that this has been done. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +lattice_beam=6.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel + +scoring_opts= +skip_diagnostics=false +skip_scoring=false +# we may later add extra-{left,right}-context options, but these might be +# problematic. +extra_left_context=0 +extra_right_context=0 +minimize=false +lang=default +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; +set -e -u + +if [ $# -ne 5 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/chaina/decode.sh --nj 8 \\" + echo " data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final" + echo " exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --lattice-beam # Lattice pruning beam; default 6.0" + echo " --iter # Iteration of model to decode; default is final." 
+ echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + + +data=$1 +graphdir=$2 +model_dir=$3 +embedding_dir=$4 +dir=$5 + + +mkdir -p $dir/log + +for f in $graphdir/HCLG.fst $data/utt2spk $model_dir/$lang.mdl $model_dir/$lang.ada \ + $model_dir/info.txt $embedding_dir/output.scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +else + thread_string= + queue_opt= +fi + +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$model_dir/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +top_subsampling_factor=$[frame_subsampling_factor/bottom_subsampling_factor] + + +# We need to use the output named 'output-si' from the model, since this the speaker independent +# decoding pass. +model="nnet3-am-copy --edits='remove-output-nodes name=output; rename-node old-name=output-si new-name=output' $model_dir/${lang}.mdl -|" + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string \ + --frame-subsampling-factor=$top_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + "$model" \ + $graphdir/HCLG.fst \ + "scp:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp|" \ + "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" --model $model_dir/${lang}.mdl $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/chaina/get_model_context.sh b/egs/wsj/s5/steps/chaina/get_model_context.sh index 42876c8d687..7abf1f6e3b5 100755 --- a/egs/wsj/s5/steps/chaina/get_model_context.sh +++ b/egs/wsj/s5/steps/chaina/get_model_context.sh @@ -124,6 +124,7 @@ right_context=$[bottom_right_context+(max_top_right_context*bottom_subsampling_f cat >$info_file < # Acoustic scale for getting best-path (default: 0.1)" + echo " --iter # default: final; affects model location if --model" + echo " # not specified." + echo " --model # Name of .mdl file (if not specified, defaults" + echo " # to /../.mdl if not specified." 
echo "e.g.:" echo "$0 data/lang exp/tri4b/decode_dev" echo "This script writes some diagnostics to /log/alignments.log" @@ -31,7 +36,9 @@ fi lang=$1 dir=$2 -model=$dir/../${iter}.mdl +if [ -z $model ]; then + model=$dir/../${iter}.mdl +fi for f in $lang/words.txt $model $dir/lat.1.gz $dir/num_jobs; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; diff --git a/egs/wsj/s5/steps/nnet3/compute_output.sh b/egs/wsj/s5/steps/nnet3/compute_output.sh index e55f705043b..1f61e97876e 100755 --- a/egs/wsj/s5/steps/nnet3/compute_output.sh +++ b/egs/wsj/s5/steps/nnet3/compute_output.sh @@ -35,6 +35,7 @@ if [ $# -ne 3 ]; then echo "e.g.: steps/nnet3/compute_output.sh --nj 8 \\" echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" echo " data/test_eval92_hires exp/nnet3/tdnn exp/nnet3/tdnn/output" + echo "Output will be in /output.scp" echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --nj # number of parallel jobs" diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 14dda2bd457..adf686fa10e 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -19,7 +19,7 @@ min_active=200 ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final -num_threads=1 # if >1, will use gmm-latgen-faster-parallel +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. # In that case it is recommended to set num-threads to a large # number, e.g. 20 if you have that many free CPU slots on a GPU From d5c96225f7608823b638cb128b8727fbf2122a8c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 19 Jan 2019 16:11:31 -0500 Subject: [PATCH 80/87] [src,scripts,egs] Further progress; 1st actual decoding --- .../s5/local/chaina/tuning/run_tdnn_1b.sh | 21 ++- egs/wsj/s5/steps/chaina/decode.sh | 155 ++++++++++++++++++ egs/wsj/s5/steps/chaina/decode_si.sh | 2 - egs/wsj/s5/steps/chaina/train.sh | 8 +- src/adapt/Makefile | 3 + src/adapt/differentiable-fmllr.h | 8 +- src/adapt/differentiable-transform-itf.cc | 13 ++ src/adapt/differentiable-transform-itf.h | 14 ++ src/adapt/differentiable-transform-test.cc | 5 +- src/adapt/differentiable-transform.cc | 28 ++++ src/adapt/differentiable-transform.h | 10 +- src/adapt/generic-transform.cc | 36 ++++ src/adapt/generic-transform.h | 13 +- src/gmmbin/gmm-est-fmllr.cc | 1 - src/nnet3/Makefile | 2 +- src/nnet3a/nnet-chaina-training.cc | 2 +- src/nnet3a/nnet-chaina-training.h | 118 ++++++------- src/nnet3abin/nnet3-adapt.cc | 82 ++++++++- 18 files changed, 443 insertions(+), 78 deletions(-) create mode 100755 egs/wsj/s5/steps/chaina/decode.sh diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh index f0917b14ae2..547467f5856 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh @@ -1,5 +1,12 @@ #!/bin/bash + +# Not working well yet (WER should be closer to 12%. Need to check for bugs). 
+#a09:s5: grep WER exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +#%WER 17.08 [ 3440 / 20138, 306 ins, 608 del, 2526 sub ] exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall/wer_17_0.0 +#a09:s5: grep WER exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +#%WER 19.81 [ 3990 / 20138, 389 ins, 657 del, 2944 sub ] exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall.si/wer_12_0.0 + # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -354,10 +361,22 @@ if [ $stage -le 24 ]; then steps/chaina/decode_si.sh --cmd "$cmd" --nj 10 --num-threads 4 \ data/${data}_hires $tree_dir/graph_tgsmall\ $dir/final $dir/data/final/${data} \ - $dir/decode_${data}_tgsmall + $dir/decode_${data}_tgsmall.si + done +fi + +if [ $stage -le 25 ]; then + # Do the speaker-dependent decoding pass + test_sets=dev_clean_2 + for data in $test_sets; do + steps/chaina/decode.sh --cmd "$cmd" --num-threads 4 --stage 2 \ + data/${data}_hires $tree_dir/graph_tgsmall\ + $dir/final $dir/data/final/${data} \ + $dir/decode_${data}_tgsmall.si $dir/decode_${data}_tgsmall done fi + exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode.sh b/egs/wsj/s5/steps/chaina/decode.sh new file mode 100755 index 00000000000..df7b627f8c8 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/decode.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does the speaker-dependent pass of decoding with a 'chaina' model, +# including getting the speaker-dependent transforms and dumping lattices. + + +# Begin configuration section. +stage=1 + +acwt=1.0 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=10.0 # This is typically used in 'chain' systems to scale + # acoustics by 10 so the regular scoring script works OK + # (since it evaluates the LM scale at integer values, + # typically close to 10). We make this the default in + # order to make scoring easier, but you should remember + # when using the lattices, that this has been done. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +silence_weight=0.01 # We weight down the posteriors of silence (needs to be tuned). +lattice_beam=6.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel + +scoring_opts= +skip_diagnostics=false +skip_scoring=false +# we may later add extra-{left,right}-context options, but these might be +# problematic. +extra_left_context=0 +extra_right_context=0 +minimize=false +lang=default +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; +set -e -u + +if [ $# -ne 6 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/chaina/decode.sh --nj 8 \\" + echo " data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final" + echo " exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si exp/chaina/tdnn1a_sp/decode_test_bg" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --lattice-beam # Lattice pruning beam; default 6.0" + echo " --iter # Iteration of model to decode; default is final." 
+ echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + + +data=$1 +graphdir=$2 +model_dir=$3 +embedding_dir=$4 +si_dir=$5 +dir=$6 + + +mkdir -p $dir/log + +for f in $graphdir/HCLG.fst $data/utt2spk $model_dir/$lang.mdl $model_dir/$lang.ada \ + $model_dir/info.txt $embedding_dir/output.scp $si_dir/lat.1.gz $si_dir/num_jobs; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +nj=$(cat $si_dir/num_jobs) +echo $nj > $dir/num_jobs +sdata=$data/split$nj; +silphonelist=$(cat $graphdir/phones/silence.csl) +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$model_dir/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +top_subsampling_factor=$[frame_subsampling_factor/bottom_subsampling_factor] + + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting speaker-dependent transforms" + # The --acoustic-scale=0.1 is to reverse the --post-decode-acwt (default: 10) + # that we used when dumping the SI lattices (this was for scoring + # convenience). + $cmd JOB=1:$nj $dir/log/get_transform.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=0.1 ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $model_dir/${lang}.mdl ark:- ark:- \| \ + post-to-pdf-post $model_dir/${lang}.mdl ark:- ark:- \| \ + nnet3-adapt --verbose=2 --frame-subsampling-factor=$top_subsampling_factor \ + get-transforms $model_dir/${lang}.ada ark:$sdata/JOB/spk2utt \ + "scp:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp|" \ + ark,s,cs:- ark:$dir/trans.JOB.ark +fi + +if [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +else + thread_string= + queue_opt= +fi + +if [ $stage -le 2 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string \ + --frame-subsampling-factor=$top_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + $model_dir/${lang}.mdl \ + $graphdir/HCLG.fst \ + "ark:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB.ark scp:- ark:-|" \ + "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 3 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" --model $model_dir/${lang}.mdl $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 4 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." 
+exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode_si.sh b/egs/wsj/s5/steps/chaina/decode_si.sh index 7c9c69b8f37..f21d82f6278 100755 --- a/egs/wsj/s5/steps/chaina/decode_si.sh +++ b/egs/wsj/s5/steps/chaina/decode_si.sh @@ -4,8 +4,6 @@ # Apache 2.0. # This script does the speaker-independent pass of decoding with a 'chaina' model, -# and it leaves the embeddings on disk ready to be used in the adapted pass of -# decoding. # Begin configuration section. diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index fd5fa548d7f..30eb18ebf30 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -246,7 +246,11 @@ if [ $stage -le $num_iters ] && $train; then mkdir -p $dir/final den_fst_dir=$egs_dir/misc - $cmd $gpu_cmd_opt JOB=1:$num_scp_files $dir/log/acc_target_model.JOB.log \ + num_jobs=$num_scp_files + [ $num_jobs -gt 4 ] && num_jobs=4 # there are so few params to estimate that + # more than 4 jobs would be a waste. + + $cmd $gpu_cmd_opt JOB=1:$num_jobs $dir/log/acc_target_model.JOB.log \ nnet3-chaina-train --job-id=JOB --use-gpu=$use_gpu \ --bottom-subsampling-factor=$bottom_subsampling_factor \ --print-interval=10 \ @@ -260,7 +264,7 @@ if [ $stage -le $num_iters ] && $train; then stats=$dir/final/${lang}.*.ada run.pl $dir/log/estimate_target_model_${lang}.log \ nnet3-adapt estimate $stats $dir/final/${lang}.ada - #rm $stats + rm $stats done cp $dir/$num_iters/bottom.raw $dir/$num_iters/*.mdl $dir/final fi diff --git a/src/adapt/Makefile b/src/adapt/Makefile index 26aa383f333..25c016b4e6d 100644 --- a/src/adapt/Makefile +++ b/src/adapt/Makefile @@ -2,6 +2,9 @@ all: include ../kaldi.mk +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + TESTFILES = differentiable-fmllr-test differentiable-transform-test OBJFILES = differentiable-fmllr.o differentiable-transform-itf.o \ diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h index e2db94102a2..c15175752a1 100644 --- a/src/adapt/differentiable-fmllr.h +++ b/src/adapt/differentiable-fmllr.h @@ -546,10 +546,10 @@ class FmllrEstimator { /// Return the linear parameter matrix. Adapted features are /// y_t = A x_t + b. You won't necessarily need to /// call this, you can use ComputeAdaptedFeatures() intead. - const MatrixBase &GetLinearParams() { return A_; } + const MatrixBase &GetLinearParams() const { return A_; } /// Return the bias term b. - const VectorBase &GetBiasParams() { return b_; } + const VectorBase &GetBiasParams() const { return b_; } /// Computes the adapted features y_t = A x_t + b. /// feats (x) and adapted_feats (y) must have the same dimension. Must @@ -816,7 +816,7 @@ class FmllrEstimator { and eventually to the features). Or: if there is only one training sequence, you can use the -o simplified interface: after calling the constructor, + simplified interface: after calling the constructor, - call ForwardCombined() - call BackwardCombined() @@ -862,7 +862,7 @@ class MeanOnlyTransformEstimator { BaseFloat TotalCount() { return gamma_.Sum(); } /// Return the bias term b. - const VectorBase &GetOffset() { return offset_; } + const VectorBase &GetOffset() const { return offset_; } /// Computes the adapted features y_t = x_t + b. /// feats (x) and adapted_feats (y) must have the same dimension. 
Must diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc index e09c0ca7b2c..98776847f59 100644 --- a/src/adapt/differentiable-transform-itf.cc +++ b/src/adapt/differentiable-transform-itf.cc @@ -106,6 +106,19 @@ void DifferentiableTransform::TestingForwardBatch( this_output(output_cpu.RowData(chunk), frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + /* + // The following testing code was temporarily present to test + // GetTransformAsMatrix().. + if (GetVerboseLevel() >= 3 && RandInt(0, 1) == 0) { + Matrix transform(dim, dim + 1, kUndefined); + this->GetTransformAsMatrix(*stats, &transform); + SubMatrix linear_part(transform, 0, dim, 0, dim); + Vector offset(dim); + offset.CopyColFromMat(transform, dim); + this_output.CopyRowsFromVec(offset); + this_output.AddMatMat(1.0, this_input, kNoTrans, + linear_part, kTrans, 1.0); + } else */ this->TestingForward(this_input, *stats, &this_output); } delete stats; diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index 00c58a076b9..6dd75d3137f 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -291,6 +291,7 @@ class DifferentiableTransform { const SubPosterior &posteriors, SpeakerStatsItf *speaker_stats) const = 0; + // Applies the transformation implied by the statistics in 'speaker_stats' to // 'input', storing in the result in 'output'. You must have done any estimation // procedure that is required first, by calling Estimate() on the speaker-stats @@ -300,6 +301,19 @@ class DifferentiableTransform { const SpeakerStatsItf &speaker_stats, MatrixBase *output) const = 0; + + // This function outputs the speaker-specific transformation in a matrix form + // with an offset, i.e., a matrix of dimension Dim() by Dim() + 1 where + // the last column represents the offset term (the same way Kaldi represents + // LDA and fMLLR transforms as matrices. + // The 'speaker_stats' object must have had Estimate() called on it. + // 'transform' must be of dimension Dim() by Dim() + 1; it may contain + // NaN's at entry. + virtual void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const = 0; + + // TestingForwardBatch() combines GetEmptySpeakerStats(), TestingAccumulate() and // TestingForward(). It has a default implementation. It is a convenience // function that may be useful during training under some circumstances, e.g. diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc index cde695a6ab4..8ad9ee7dcfa 100644 --- a/src/adapt/differentiable-transform-test.cc +++ b/src/adapt/differentiable-transform-test.cc @@ -195,7 +195,8 @@ void TestTraining(DifferentiableTransform *transform) { int32 num_final_iters = transform->NumFinalIterations(); for (int32 i = 0; i < num_final_iters; i++) { transform->Accumulate(i, input_feats, num_chunks, num_spk, post); - transform->Add(*transform); // Just check Add() does not crash. + // transform->Add(*transform); // Just check Add() does not crash. + // it does crash but because of AddVec() failing on this == other.. its ok. 
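+    // Accumulate(i, ...) above gathers the stats for final iteration i;
+    // Estimate(i) below then estimates the class-dependent means (the target
+    // model) from those stats.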
transform->Estimate(i); } CuMatrix output_feats2(output_feats.NumRows(), @@ -270,7 +271,7 @@ void UnitTestIo() { int main() { using namespace kaldi::differentiable_transform; - + kaldi::SetVerboseLevel(3); for (int32 i = 0; i < 3; i++) { UnitTestReadFromConfig(); UnitTestIo(); diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc index b1a5f799c96..bcaf356e695 100644 --- a/src/adapt/differentiable-transform.cc +++ b/src/adapt/differentiable-transform.cc @@ -37,6 +37,12 @@ FmllrMinibatchInfo::~FmllrMinibatchInfo() { } +void FmllrSpeakerStats::Estimate() { + BaseFloat objf_impr = estimator.Estimate(); + KALDI_VLOG(1) << "Objective function improvement per frame is " << objf_impr; +} + + int32 FmllrTransform::InitFromConfig( int32 cur_pos, std::vector *config_lines) { @@ -305,6 +311,17 @@ void FmllrTransform::TestingForward( stats->estimator.AdaptFeatures(input, output); } +void FmllrTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + const FmllrSpeakerStats *stats = dynamic_cast( + &speaker_stats); + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + transform->ColRange(0, dim).CopyFromMat(stats->estimator.GetLinearParams()); + transform->CopyColFromVec(stats->estimator.GetBiasParams(), dim); +} + FmllrTransform::~FmllrTransform() { delete target_model_; } @@ -586,6 +603,17 @@ void MeanOnlyTransform::TestingForward( stats->estimator.AdaptFeatures(input, output); } +void MeanOnlyTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + const MeanOnlyTransformSpeakerStats *stats = + dynamic_cast(&speaker_stats); + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + transform->SetUnit(); + transform->CopyColFromVec(stats->estimator.GetOffset(), dim); +} + MeanOnlyTransform::~MeanOnlyTransform() { delete target_model_; } diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h index 98d55e6e237..c3abb1bbb96 100644 --- a/src/adapt/differentiable-transform.h +++ b/src/adapt/differentiable-transform.h @@ -94,6 +94,10 @@ class FmllrTransform: public DifferentiableTransform { const SpeakerStatsItf &speaker_stats, MatrixBase *output) const override; + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + FmllrTransform(const FmllrTransform &other); FmllrTransform(): target_model_(NULL) { } @@ -148,7 +152,7 @@ class FmllrSpeakerStats: public SpeakerStatsItf { const VectorBase &s): estimator(opts, mu, s) { } - void Estimate() override { estimator.Estimate(); } + void Estimate() override; FmllrEstimator estimator; @@ -213,6 +217,10 @@ class MeanOnlyTransform: public DifferentiableTransform { const SpeakerStatsItf &speaker_stats, MatrixBase *output) const override; + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + MeanOnlyTransform(const MeanOnlyTransform &other); MeanOnlyTransform(): target_model_(NULL) { } diff --git a/src/adapt/generic-transform.cc b/src/adapt/generic-transform.cc index 24ccc5c396a..c2c73aefe85 100644 --- a/src/adapt/generic-transform.cc +++ b/src/adapt/generic-transform.cc @@ -310,6 +310,12 @@ void SequenceTransform::TestingForward( transforms_.back()->TestingForward(input, speaker_stats, output); } +void SequenceTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase 
*transform) const { + transforms_.back()->GetTransformAsMatrix(speaker_stats, transform); +} + SequenceMinibatchInfo::~SequenceMinibatchInfo() { for (size_t i = 0; i < info_vec.size(); i++) @@ -565,6 +571,36 @@ void AppendTransform::TestingForward( KALDI_ASSERT(dim_offset == input.NumCols()); } +void AppendTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + // first make sure the off-diagonal elements are zero. + transform->SetZero(); + const AppendSpeakerStats *stats = + dynamic_cast(&speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix transform_part(*transform, dim_offset, this_dim, + dim_offset, this_dim + 1); + transforms_[i]->GetTransformAsMatrix(*(stats->stats[i]), &transform_part); + if (i + 1 < transforms_.size()) { + int32 current_offset_column = dim_offset + this_dim, + required_offset_column = dim; + for (int32 r = dim_offset; r < dim_offset + this_dim; r++) { + (*transform)(r, required_offset_column) = (*transform)(r, current_offset_column); + (*transform)(r, current_offset_column) = BaseFloat(0.0); + } + } + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == Dim()); +} + void AppendSpeakerStats::Estimate() { for (size_t i = 0; i < stats.size(); i++) stats[i]->Estimate(); diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h index 602bbabc656..9b7933b69af 100644 --- a/src/adapt/generic-transform.h +++ b/src/adapt/generic-transform.h @@ -95,8 +95,11 @@ class NoOpTransform: public DifferentiableTransform { output->CopyFromMat(input); } - void Estimate(int32 final_iter) override { } + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override { transform->SetUnit(); } + void Estimate(int32 final_iter) override { } NoOpTransform(): dim_(-1) { } @@ -187,6 +190,10 @@ class SequenceTransform: public DifferentiableTransform { const SpeakerStatsItf &speaker_stats, MatrixBase *output) const override; + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + SequenceTransform(const SequenceTransform &other); SequenceTransform() { } @@ -282,6 +289,10 @@ class AppendTransform: public DifferentiableTransform { const SpeakerStatsItf &speaker_stats, MatrixBase *output) const override; + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + void Estimate(int32 final_iter) override; AppendTransform(const AppendTransform &other); diff --git a/src/gmmbin/gmm-est-fmllr.cc b/src/gmmbin/gmm-est-fmllr.cc index 9f8dfd89143..e0702c4fcf8 100644 --- a/src/gmmbin/gmm-est-fmllr.cc +++ b/src/gmmbin/gmm-est-fmllr.cc @@ -195,4 +195,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 5e67211c3a7..66177559218 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -41,6 +41,6 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../base/kaldi-base.a include 
../makefiles/default_rules.mk diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 83e7ba520b5..7c70e2fb099 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -695,7 +695,7 @@ static inline void ConvertPosteriorElement( } } -void NnetChainaTopTrainer::ConvertPosterior( +void ConvertPosterior( const Posterior &post_at_output, int32 num_sequences, int32 first_input_t, diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index eb9b81d40db..79953ba1faf 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -530,64 +530,6 @@ class NnetChainaTopTrainer { Posterior *posterior, CuMatrix *input_deriv); - /** - Converts the format of the posterior from how it is at the output of the - network to how it is at the input (i.e. in the embedding space). - Basically, this will consist of padding with empty posteriors for the - "context frames", and possibly upsampling the posteriors (by just repeating - each one for, say, 3 frames, if top_subsampling_factor == 3). The - rule we'll use is: copy the posterior from the output frame that - is closest in numbering, rounding down in case of ties (i.e., for even - subsampling factor). - - @param [in] post_at_output The posterior that needs to be padded, - consisting of 'num_sequences' sequences, each with 't' - values starting at zero, at multiples of - 'top_subsampling_factor', and with number of 't' values - determined by: num_frames_out = post_at_output.size() / - num_sequences. The 't' has the larger stride than the - minibatch index 'n', so it's: frame t=0 of all sequences, - then frame t=1*top_subsampling_factor of all sequences, - and so on. - @param [in] num_sequences The number of sequences/chunks - @param [in] first_input_t The first 't' value at the input, for which - we need a posterior for (note: negative 't' values will - get zero posterior). Implicitly, first_output_t = 0. - The number of input frames is worked out as - post_at_input->size() / num_sequences; the 't' values - at the input are assumed to be consecutive. - @param [in] top_subsampling_factor The number of frames with which - 't' values at the output are separated. - @param [in] pdf_map This is either the empty vector (meaning: - the DifferentiableTransform object deals with pdf-ids - directly), or it is a map from pdf-ids to cluster-ids. - This would actually be obtained from build-tree-two-level - after building a two-level tree, and it would be stored - in the .ada object. The actual class labels that - the DifferentiableTransform object deals with, will - be the values stored in 'pfd_map' (i.e. these cluster-ids). - @param [in] num_classes Provided for checking purposes only: the - number of classes that the DifferentiableTransform object - expects. If pdf_map is empty we expect this to be the - same as the number of pdf-ids (and the ints in - post_at_output to be in the range [0, num_classes - 1]). - If pdf_map is nonempty, we expect this to be the same - as the maximum element in pdf_map, plus one. - @param [out] post_at_input The posterior after padding and possibly - subsampling. Should have the correct size but its - elements are expected to be empty at entry. Like - post_at_output, the 't' has the larger stride than - the minibatch-index 'n'. 
- - */ - void ConvertPosterior(const Posterior &post_at_output, - int32 num_sequences, - int32 first_input_t, - int32 top_subsampling_factor, - const std::vector &pdf_map, - int32 num_classes, - Posterior *post_at_input); - /** Does the adapted pass of training. @param [in] computation The adapted version of the @@ -928,6 +870,66 @@ class NnetChainaTrainer { }; +/** + This utility function, used in training and test-time adaptation code, + converts the format of the posterior from how it is at the output of the + top network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). The + rule we'll use is: copy the posterior from the output frame that + is closest in numbering, rounding down in case of ties (i.e., for even + subsampling factor). + + @param [in] post_at_output The posterior that needs to be padded, + consisting of 'num_sequences' sequences, each with 't' + values starting at zero, at multiples of + 'top_subsampling_factor', and with number of 't' values + determined by: num_frames_out = post_at_output.size() / + num_sequences. The 't' has the larger stride than the + minibatch index 'n', so it's: frame t=0 of all sequences, + then frame t=1*top_subsampling_factor of all sequences, + and so on. + @param [in] num_sequences The number of sequences/chunks + @param [in] first_input_t The first 't' value at the input, for which + we need a posterior for (note: negative 't' values will + get zero posterior). Implicitly, first_output_t = 0. + The number of input frames is worked out as + post_at_input->size() / num_sequences; the 't' values + at the input are assumed to be consecutive. + @param [in] top_subsampling_factor The number of frames with which + 't' values at the output are separated. + @param [in] pdf_map This is either the empty vector (meaning: + the DifferentiableTransform object deals with pdf-ids + directly), or it is a map from pdf-ids to cluster-ids. + This would actually be obtained from build-tree-two-level + after building a two-level tree, and it would be stored + in the .ada object. The actual class labels that + the DifferentiableTransform object deals with, will + be the values stored in 'pfd_map' (i.e. these cluster-ids). + @param [in] num_classes Provided for checking purposes only: the + number of classes that the DifferentiableTransform object + expects. If pdf_map is empty we expect this to be the + same as the number of pdf-ids (and the ints in + post_at_output to be in the range [0, num_classes - 1]). + If pdf_map is nonempty, we expect this to be the same + as the maximum element in pdf_map, plus one. + @param [out] post_at_input The posterior after padding and possibly + subsampling. Should have the correct size but its + elements are expected to be empty at entry. Like + post_at_output, the 't' has the larger stride than + the minibatch-index 'n'. 
+*/ +void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, + Posterior *post_at_input); + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc index de540e49b71..8bd6570bf6f 100644 --- a/src/nnet3abin/nnet3-adapt.cc +++ b/src/nnet3abin/nnet3-adapt.cc @@ -22,6 +22,7 @@ #include "nnet3/nnet-nnet.h" #include "hmm/transition-model.h" #include "adapt/differentiable-transform-itf.h" +#include "nnet3a/nnet-chaina-training.h" int main(int argc, char *argv[]) { try { @@ -44,7 +45,12 @@ int main(int argc, char *argv[]) { " or: nnet3-adapt estimate ... \n" " .. which sums stats and calls Estimate(), to get the final class-dependent means... \n" "(e.g. nnet3-adapt estimate foo/final/default.{1,2,3,4,5,6}.ada foo/final/default.ada\n" - " or: nnet3-adapt [options] adapt \n" + " or: nnet3-adapt [options] get-transforms \n" + " ... which estimates and dumps speaker-specific transforms as matrices, which\n" + " could be applied to the features with transform-feats; if you want\n" + " utterance-specific transforms, make spk2utt a one-to-one map.\n" + " is a wspecifier where matrices will be written.\n" + "(e.g.: nnet3-adapt final.ada spk2utt ark:- ark:feats.scp ark:1.trans)\n" "\n" "See also: nnet3-chaina-train\n"; @@ -52,6 +58,7 @@ int main(int argc, char *argv[]) { bool remove_pdf_map = false; int32 num_classes = -1; int32 iter = 0; + int32 frame_subsampling_factor = 1; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -64,6 +71,12 @@ int main(int argc, char *argv[]) { "pdf-ids."); po.Register("iter", &iter, "Only for the 'estimate' command: iteration " "of estimation, will always be 0 in most setups."); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, + "Factor by which the posteriors we read are subsampled relative " + "to the features (only for the get-transforms command). 
" + "Will correspond to the top-subsampling-factor," + "which, in chaina scripts, refers to frame_subsampling_factor " + "divided by bottom_subsampling_factor"); po.Read(argc, argv); @@ -134,9 +147,6 @@ int main(int argc, char *argv[]) { } WriteKaldiObject(transform, transform_wxfilename, binary_write); return 0; - } else if (po.GetOptArg(1) == "adapt" && po.NumArgs() == 5) { - KALDI_ERR << "The 'adapt' command has not been implemented yet."; - return 0; } else if (po.GetOptArg(1) == "estimate" && po.NumArgs() >= 3) { DifferentiableTransformMapped transform; std::string transform_rxfilename = po.GetArg(2); @@ -152,6 +162,70 @@ int main(int argc, char *argv[]) { std::string transform_wxfilename = po.GetArg(po.NumArgs()); WriteKaldiObject(transform, transform_wxfilename, binary_write); return 0; + } else if (po.GetOptArg(1) == "get-transforms" && po.NumArgs() == 6) { + std::string transform_rxfilename = po.GetArg(2), + spk2utt_rspecifier = po.GetArg(3), + feats_rspecifier = po.GetArg(4), + post_rspecifier = po.GetArg(5), + transforms_wspecifier = po.GetArg(6); + + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessPosteriorReader post_reader(post_rspecifier); + RandomAccessBaseFloatMatrixReader feature_reader(feats_rspecifier); + BaseFloatMatrixWriter transform_writer(transforms_wspecifier); + int32 num_done = 0, num_no_post = 0, num_other_error = 0; + + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + std::unique_ptr stats( + transform.transform->GetEmptySpeakerStats()); + std::string spk = spk2utt_reader.Key(); + bool got_stats = false; + const std::vector &uttlist = spk2utt_reader.Value(); + for (size_t i = 0; i < uttlist.size(); i++) { + std::string utt = uttlist[i]; + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Did not find features for utterance " << utt; + num_other_error++; + continue; + } + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posteriors for utterance " << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(utt); + const Posterior &post_in = post_reader.Value(utt); + Posterior post_upsampled(feats.NumRows()); + const Posterior *post_to_use = NULL; + if (frame_subsampling_factor != 1 || !transform.pdf_map.empty()) { + ConvertPosterior( + post_in, 1, 0, frame_subsampling_factor, transform.pdf_map, + transform.transform->NumClasses(), &post_upsampled); + post_to_use = &post_upsampled; + } else { + KALDI_ASSERT(post_in.size() == size_t(feats.NumRows()) && + "Mismatch in posterior vs. feats dimension"); + post_to_use = &post_in; + } + transform.transform->TestingAccumulate(feats, *post_to_use, stats.get()); + got_stats = true; + num_done++; + } + if (!got_stats) { + KALDI_WARN << "Got no stats for speaker " << spk; + } else { + stats->Estimate(); + int32 dim = transform.transform->Dim(); + Matrix transform_mat(dim, dim + 1, kUndefined); + transform.transform->GetTransformAsMatrix(*stats, &transform_mat); + transform_writer.Write(spk, transform_mat); + } + } + KALDI_LOG << "Done " << num_done << " files, " << num_no_post + << " with no posts, " << num_other_error << " with other errors."; + return (num_done != 0 && num_done > (num_no_post + num_other_error)) ? 
0 : 1; } else { po.PrintUsage(); exit(1); From 315a5cb35af103d0049c7e9ca4991ba127f01d12 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 19 Jan 2019 16:22:43 -0500 Subject: [PATCH 81/87] [src,egs] Fix more bugs --- .../s5/local/chaina/tuning/run_tdnn_1a.sh | 19 ++++++++++++++++++- .../s5/local/chaina/tuning/run_tdnn_1b.sh | 2 +- src/nnet3a/nnet-chaina-training.cc | 2 +- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh index 438ce03647f..8aa00c0d975 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -1,5 +1,11 @@ #!/bin/bash + +# grep WER exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 21.44 [ 4317 / 20138, 341 ins, 947 del, 3029 sub ] exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 19.72 [ 3971 / 20138, 317 ins, 771 del, 2883 sub ] exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall/wer_17_0.0 + # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -354,7 +360,18 @@ if [ $stage -le 24 ]; then steps/chaina/decode_si.sh --cmd "$cmd" --nj 10 --num-threads 4 \ data/${data}_hires $tree_dir/graph_tgsmall\ $dir/final $dir/data/final/${data} \ - $dir/decode_${data}_tgsmall + $dir/decode_${data}_tgsmall.si + done +fi + +if [ $stage -le 25 ]; then + # Do the speaker-dependent decoding pass + test_sets=dev_clean_2 + for data in $test_sets; do + steps/chaina/decode.sh --cmd "$cmd" --num-threads 4 \ + data/${data}_hires $tree_dir/graph_tgsmall\ + $dir/final $dir/data/final/${data} \ + $dir/decode_${data}_tgsmall.si $dir/decode_${data}_tgsmall done fi diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh index 547467f5856..59651f24bf2 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh @@ -369,7 +369,7 @@ if [ $stage -le 25 ]; then # Do the speaker-dependent decoding pass test_sets=dev_clean_2 for data in $test_sets; do - steps/chaina/decode.sh --cmd "$cmd" --num-threads 4 --stage 2 \ + steps/chaina/decode.sh --cmd "$cmd" --num-threads 4 \ data/${data}_hires $tree_dir/graph_tgsmall\ $dir/final $dir/data/final/${data} \ $dir/decode_${data}_tgsmall.si $dir/decode_${data}_tgsmall diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 7c70e2fb099..3465e299ce3 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -708,7 +708,7 @@ void ConvertPosterior( s = top_subsampling_factor; KALDI_ASSERT(input_post_size % num_sequences == 0 && output_post_size % num_sequences == 0 && - input_post_size >= output_post_size * top_subsampling_factor && + input_post_size >= (output_post_size - 1) * top_subsampling_factor && top_subsampling_factor > 0); int32 num_frames_out = output_post_size / num_sequences, num_frames_in = input_post_size / num_sequences, From 9a3e894ed1caaf53ccd75609e6ed723673c3f4cc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 19 Jan 2019 23:33:41 -0500 Subject: [PATCH 82/87] [scripts,src,egs] Fix various bugs, still tuning. 
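
Main changes: apply L2 regularization inside the chaina top and bottom trainers
(scaled by the number of sequences and by 1 plus the unadapted-output weight),
and use model_training_scale as the max-change scale in UpdateNnetWithMaxChange();
NnetChainaBottomTrainer::Backward() now takes a num_sequences argument for this.
Also fix the dropout-schedule fraction in get_train_schedule.py, read
num_scp_files from $egs_dir/info.txt in train.sh, and update the WER comments in
run_tdnn_1b.sh.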
--- .../s5/local/chaina/tuning/run_tdnn_1b.sh | 7 ++--- .../chaina/internal/get_train_schedule.py | 2 +- egs/wsj/s5/steps/chaina/train.sh | 2 +- src/nnet3/nnet-utils.cc | 2 +- src/nnet3/nnet-utils.h | 15 ++++++----- src/nnet3a/nnet-chaina-training.cc | 27 ++++++++++++++++--- src/nnet3a/nnet-chaina-training.h | 7 +++-- 7 files changed, 44 insertions(+), 18 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh index 59651f24bf2..e3990821121 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1b.sh @@ -2,10 +2,11 @@ # Not working well yet (WER should be closer to 12%. Need to check for bugs). -#a09:s5: grep WER exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh -#%WER 17.08 [ 3440 / 20138, 306 ins, 608 del, 2526 sub ] exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall/wer_17_0.0 + #a09:s5: grep WER exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh -#%WER 19.81 [ 3990 / 20138, 389 ins, 657 del, 2944 sub ] exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall.si/wer_12_0.0 +#%WER 20.12 [ 4052 / 20138, 394 ins, 569 del, 3089 sub ] exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall.si/wer_10_0.0 +#a09:s5: grep WER exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +#%WER 18.13 [ 3652 / 20138, 297 ins, 613 del, 2742 sub ] exp/chaina/tdnn1b_sp/decode_dev_clean_2_tgsmall/wer_13_0.0 # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py index fa893cfed22..c1e9a04179b 100755 --- a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py +++ b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py @@ -117,7 +117,7 @@ def get_schedules(args): args.dropout_schedule = None dropout_edit_option = common_train_lib.get_dropout_edit_option( args.dropout_schedule, - float(num_scp_files_processed) / num_scp_files_to_process, + float(num_scp_files_processed) / max(1, (num_scp_files_to_process - args.num_jobs_final)), iter) frame_shifts = [] diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index 30eb18ebf30..db41be8c4e9 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -90,7 +90,7 @@ if ! 
[ $[frame_subsampling_factor%bottom_subsampling_factor] == 0 ]; then exit 1 fi -num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$dir/egs/info.txt) +num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$egs_dir/info.txt) steps/chaina/internal/get_train_schedule.py \ --frame-subsampling-factor=$frame_subsampling_factor \ diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 8bc3f12027b..61da1d7f6a9 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2211,7 +2211,7 @@ void MaxChangeStats::Print(const Nnet &nnet) const { KALDI_LOG << "The global max-change was enforced " << ((100.0 * num_max_change_global_applied) / num_minibatches_processed) - << " \% of the time."; + << "\% of the time."; } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 60a18f15d84..a5d17eb0437 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -331,13 +331,13 @@ void ReadEditConfig(std::istream &config_file, Nnet *nnet); \code Nnet temp_nnet(delta_nnet); - ScaleNnet(1.0 / max_change_scale, &temp_nnet); - [ Scale down parameters for each component of temp_nnet as needed so - their Euclidean norms do not exceed their per-component max-changes ] + ScaleNnet(scale, &temp_nnet); + [ Scale down parameters for each component of temp_nnet as needed so + their Euclidean norms do not exceed (their per-component max-changes + each multiplied by max_change_scale) ] [ Scale down temp_nnet as needed so its Euclidean norm does not exceed - the global max-change ] - ScaleNnet(max_change_scale, &temp_nnet); // undo the previous scaling. - AddNnet(temp_nnet, scale, nnet); + the global max-change times max_change_scale ] + AddNnet(temp_nnet, 1.0, nnet); \endcode @param [in] delta_nnet The copy of '*nnet' neural network that contains @@ -361,7 +361,8 @@ void ReadEditConfig(std::istream &config_file, Nnet *nnet); max-change, and 'max_change_scale * max_param_change' as the global max-change). @param [in] scale This value, which will normally be 1.0, is a scaling - factor used when adding to 'nnet', applied after any max-changes. + factor used when adding to 'nnet', which is (conceptually) + applied before any max-changes. It is provided for backstitch-related purposes. @param [in,out] nnet The nnet which we add to. @param [out] num_max_change_per_component_applied We add to the elements of diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index 3465e299ce3..9743eb686ef 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -522,12 +522,21 @@ bool NnetChainaTopTrainer::TrainAdapted( if (model_training_scale != 0.0) { // If we're actually training the top model... + // If relevant, add in the part of the gradient that comes from L2 + // regularization. The factor of (1.0 + opts_.unadapted_top_weight) + // is to make it proportional to the magnitude of the derivative. + ApplyL2Regularization( + *nnet_, + supervision.num_sequences * opts_.nnet_config.l2_regularize_factor * + (1.0 + opts_.unadapted_top_weight), + delta_nnet_); + // Update the parameters of nnet. // Note: normally, momentum is 0.0. 
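    // Here model_training_scale is passed both as the max-change scale (so the
    // per-component and global max-change limits are scaled by it) and, times
    // (1 - momentum), as the scale on the parameter delta itself; see the
    // updated comment above UpdateNnetWithMaxChange() in nnet-utils.h.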
bool success = UpdateNnetWithMaxChange( *delta_nnet_, nnet_config.max_param_change, - 1.0, + model_training_scale, model_training_scale * (1.0 - nnet_config.momentum), nnet_, &max_change_stats_); @@ -800,6 +809,7 @@ NnetComputer* NnetChainaBottomTrainer::Forward( void NnetChainaBottomTrainer::Backward(BaseFloat model_training_scale, + int32 num_sequences, NnetComputer *computer, CuMatrix *output_deriv) { // if model_training_scale was 0.0, this function should not have been called. @@ -811,13 +821,24 @@ void NnetChainaBottomTrainer::Backward(BaseFloat model_training_scale, const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. The factor of (1.0 + opts_.unadapted_bottom_weight) + // is to make it proportional to the magnitude of the derivative. + ApplyL2Regularization( + *nnet_, + num_sequences * opts_.nnet_config.l2_regularize_factor * + (1.0 + opts_.unadapted_bottom_weight), + delta_nnet_); + + // we may later provide a way to set a different max-change for the bottom // nnet than on the top nnet. // Note: normally, momentum is 0.0. bool success = UpdateNnetWithMaxChange( *delta_nnet_, nnet_config.max_param_change, - 1.0, + model_training_scale, model_training_scale * (1.0 - nnet_config.momentum), nnet_, &max_change_stats_); @@ -1074,7 +1095,7 @@ void NnetChainaTrainer::Train(const std::string &key, &cu_embedding_deriv : NULL)); if (success && train_bottom_nnet) { - bottom_trainer_.Backward(bottom_weight, computer, + bottom_trainer_.Backward(bottom_weight, num_sequences, computer, &cu_embedding_deriv); } else { delete computer; // if it's NULL, this will do nothing. diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index 79953ba1faf..3a0bbb583ec 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -671,14 +671,16 @@ class NnetChainaBottomTrainer { /** Does the backward pass, which will do model training. This should only be called if the bottom nnet needs to be trained. - @param [in] model_training_scale A scale we'll apply to the parameter changes - and max-change values when taking the step.. This will be + @param [in] model_training_scale A scale we'll apply to the parameter changes, + l2 term and max-change values when taking the step.. This will be referred to elsewhere as bottom_weight, or "bw" when present in keys of egs in scp files; we'll have a separately specifiable weight for the top nnet. If this is zero, we won't be training the top model on this eg at all (and we'll expect 'false' to have been passed in for the 'train_model' arg on the corresponding call to Forward()). + @param [in] num_sequences The number of sequences (chunks) we had in this + minibatch-- needed for the application of l2. @param [in] computer The computer object returned from the forward pass. This function takes ownership of it and will delete it when done with it. 
@@ -688,6 +690,7 @@ class NnetChainaBottomTrainer { */ void Backward(BaseFloat model_training_scale, + int32 num_sequences, NnetComputer *computer, CuMatrix *output_deriv); From 44816996d4e2a60149c26877ff04a491f1fe8b7e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 20 Jan 2019 22:25:47 -0500 Subject: [PATCH 83/87] [src,scripts,egs] Refactor the command line options; add more tuning scripts --- .../s5/local/chaina/tuning/run_tdnn_1c.sh | 500 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1c2.sh | 508 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1c3.sh | 517 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1d.sh | 521 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1e.sh | 511 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1f.sh | 513 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1g.sh | 521 ++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1h.sh | 528 +++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1i.sh | 552 +++++++++++++++++ .../s5/local/chaina/tuning/run_tdnn_1j.sh | 559 ++++++++++++++++++ egs/wsj/s5/steps/chaina/train.sh | 9 +- src/nnet3/nnet-diagnostics.h | 18 +- src/nnet3a/nnet-chaina-training.cc | 80 +-- src/nnet3a/nnet-chaina-training.h | 156 +++-- src/nnet3abin/nnet3-chaina-train.cc | 5 +- 15 files changed, 5355 insertions(+), 143 deletions(-) create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1c.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1c2.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1c3.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1d.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1e.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1f.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1g.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1h.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1i.sh create mode 100755 egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1j.sh diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1c.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..e4f8d29bbc1 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1c.sh @@ -0,0 +1,500 @@ +#!/bin/bash + + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.75 [ 3575 / 20138, 362 ins, 484 del, 2729 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_10_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 17.10 [ 3443 / 20138, 327 ins, 478 del, 2638 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_11_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
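+# Note on the subsampling options below: bottom_subsampling_factor is the
+# subsampling applied at the output of the bottom (embedding) nnet, and the
+# 'top' subsampling factor used elsewhere (e.g. in steps/chaina/decode.sh) is
+# frame_subsampling_factor / bottom_subsampling_factor.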
+affix=1c # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 + + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
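+# Note (editor's illustrative sketch, not part of the original patch): the
+# learning_rate_factor set a few lines below works out to 0.5/0.1 = 5.0 with
+# this script's default xent_regularize=0.1.  The 'echo "print ..." | python'
+# idiom assumes Python 2; a Python-3-friendly equivalent would be:
+#   learning_rate_factor=$(python3 -c "print(0.5/$xent_regularize)")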
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
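+  # Note (editor's illustrative sketch, not part of the original patch): once
+  # the nnet3-am-init command at the end of this stage has written
+  # $dir/init/default.mdl, its dimensions can be sanity-checked.  This assumes
+  # nnet3-am-info prints 'num-pdfs:', 'left-context:' and 'right-context:'
+  # lines, as it does for ordinary nnet3 acoustic models:
+  #   nnet3-am-info $dir/init/default.mdl | \
+  #     grep -E '^(num-pdfs|left-context|right-context)'
+  #   # num-pdfs should equal the $num_leaves obtained from tree-info above.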
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f). About 0.5% better. +# 1g is as 1c2 but using MeanOnlyTransform. Better!! 
+ + +# grep WER exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.22 [ 3266 / 20138, 297 ins, 463 del, 2506 sub ] exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.29 [ 2877 / 20138, 275 ins, 398 del, 2204 sub ] exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1h # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f) +# 1g is as 1c2 but using MeanOnlyTransform. Better!! + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. 
+ + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1i # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
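+# Note (editor's illustrative sketch, not part of the original patch): with
+# the values in the configuration section above (chunk_width=140,
+# chunks_per_group=4), each pseudo-speaker group spans 4 * 140 = 560 feature
+# frames, i.e. roughly 5.6 seconds of audio at the usual 10 ms frame shift,
+# before the extra egs left/right context is added:
+#   chunk_width=140; chunks_per_group=4
+#   echo "frames per group: $((chunk_width * chunks_per_group))"   # 560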
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f) +# 1g is as 1c2 but using MeanOnlyTransform. Better!! + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. 
the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1j # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l trans_model.Read(ki.Stream(), binary); info->am_nnet.Read(ki.Stream(), binary); - if (zero_component_stats_ && !top_model_test_mode_) { - ZeroComponentStats(&(info->am_nnet.GetNnet())); - } - if (top_model_test_mode_) { - Nnet &nnet = info->am_nnet.GetNnet(); + Nnet &nnet = info->am_nnet.GetNnet(); + if (opts_.nnet_config.zero_component_stats && + !opts_.top.batchnorm_test_mode) + ZeroComponentStats(&nnet); + if (opts_.top.batchnorm_test_mode) SetBatchnormTestMode(true, &nnet); + if (opts_.top.dropout_test_mode) SetDropoutTestMode(true, &nnet); - // The following is for efficiency in evaluating the top nnet, - // it may combine certain component types. - CollapseModel(CollapseModelConfig(), &bottom_nnet_); - } + // The following is for efficiency in evaluating the top nnet, + // it may combine certain component types. + if (!opts_.top.train && opts_.top.batchnorm_test_mode) + CollapseModel(CollapseModelConfig(), &nnet); } ReadFstKaldi(den_fst_filename, &(info->den_fst)); ReadKaldiObject(transform_filename, &(info->transform)); @@ -153,13 +152,13 @@ NnetChainaModels::GetTransformForLang( void NnetChainaModels::Write(const std::string &model_out_dir, bool binary, int32 job_id) { std::ostringstream ss; - if (!bottom_model_test_mode_) { + if (opts_.bottom.train) { ss << "bottom nnet and "; std::string bottom_model_name; GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); } - if (!top_model_test_mode_) { + if (opts_.top.train) { ss << "nnets for languages "; for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { const std::string &lang_name = iter->first; @@ -175,7 +174,7 @@ void NnetChainaModels::Write(const std::string &model_out_dir, } } } - if (adaptation_model_accumulate_) { + if (opts_.adaptation_model_accumulate) { ss << "adaptation-model stats for languages "; for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { const std::string &lang_name = iter->first; @@ -294,7 +293,9 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( ComputationRequest request; request.need_model_derivative = s.train_model; - request.store_component_stats = !opts_.top_model_test_mode; + // It's probably harmless to store stats unless we have batchorm components in + // test mode. + request.store_component_stats = !opts_.top.batchnorm_test_mode; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); @@ -315,7 +316,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( // the second frame of all sequences; and so on. request.outputs.resize(2); request.outputs[0].name = (s.adapted ? 
"output" : "output-si"); - request.outputs[0].has_deriv = !opts_.top_model_test_mode; + request.outputs[0].has_deriv = opts_.top.train; request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); int32 t_stride_out = top_subsampling_factor; iter = request.outputs[0].indexes.begin(); @@ -327,7 +328,7 @@ std::shared_ptr NnetChainaTopTrainer::GetComputation( iter->t = t; } } - request.outputs[1].has_deriv = !opts_.top_model_test_mode; + request.outputs[1].has_deriv = opts_.top.train; request.outputs[1].name = (s.adapted ? "output-xent" : "output-si-xent"); request.outputs[1].indexes = request.outputs[0].indexes; std::shared_ptr computation = compiler_.Compile( @@ -528,7 +529,7 @@ bool NnetChainaTopTrainer::TrainAdapted( ApplyL2Regularization( *nnet_, supervision.num_sequences * opts_.nnet_config.l2_regularize_factor * - (1.0 + opts_.unadapted_top_weight), + (1.0 + opts_.top.unadapted_weight), delta_nnet_); // Update the parameters of nnet. @@ -568,6 +569,8 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, const chain::Supervision &supervision, BaseFloat model_training_scale, CuMatrix *input_deriv) { + // note: if opts_.top.train if false, model_training_scale will have been + // already set to zero. KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences == 0); int32 frames_per_sequence_in = input.NumRows() / num_sequences, frames_per_sequence_out = supervision.frames_per_sequence; @@ -588,7 +591,7 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, bool need_unadapted_model_deriv = - (model_training_scale * opts_.unadapted_top_weight) != 0.0; + (model_training_scale * opts_.top.unadapted_weight) != 0.0; std::shared_ptr computation_unadapted = GetComputation(structure); @@ -603,14 +606,14 @@ bool NnetChainaTopTrainer::Train(const CuMatrixBase &input, } // Scale down the model derivatives from the unadapted pass. - if (need_unadapted_model_deriv && opts_.unadapted_top_weight != 1.0) - ScaleNnet(opts_.unadapted_top_weight, delta_nnet_); + if (need_unadapted_model_deriv && opts_.top.unadapted_weight != 1.0) + ScaleNnet(opts_.top.unadapted_weight, delta_nnet_); - if (input_deriv && opts_.unadapted_bottom_weight != 1.0) { + if (input_deriv && opts_.bottom.unadapted_weight != 1.0) { // Apply the scale from --unadapted-bottom-weight. We'll supply the other // factor that comes from from the language-specific bottom_weight ("bw") // ito UpdateNnetWithMaxChange() later on when we train the bottom nnet. - input_deriv->Scale(opts_.unadapted_bottom_weight); + input_deriv->Scale(opts_.bottom.unadapted_weight); } Posterior post_padded(input.NumRows()); @@ -828,7 +831,7 @@ void NnetChainaBottomTrainer::Backward(BaseFloat model_training_scale, ApplyL2Regularization( *nnet_, num_sequences * opts_.nnet_config.l2_regularize_factor * - (1.0 + opts_.unadapted_bottom_weight), + (1.0 + opts_.bottom.unadapted_weight), delta_nnet_); @@ -898,7 +901,7 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( return iter->second; } - if (opts_.bottom_model_test_mode) { + if (!opts_.bottom.train) { KALDI_ASSERT(!s.train_model); } @@ -916,10 +919,9 @@ std::shared_ptr NnetChainaBottomTrainer::GetComputation( ComputationRequest request; request.need_model_derivative = s.train_model; - // If the user supplied the option --train-bottom-model false, then we - // are using test-mode for the batch-norm on the bottom model, and we - // don't want to overwrite the batch-norm stats. 
- request.store_component_stats = !opts_.bottom_model_test_mode; + // It's probably safe to store component-level stats, unless the + // batchnorm is in test mode. + request.store_component_stats = !opts_.bottom.batchnorm_test_mode; request.inputs.resize(1); request.inputs[0].name = "input"; request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); @@ -1028,9 +1030,9 @@ void NnetChainaTrainer::Train(const std::string &key, ParseFromQueryString(key, "bw", &bottom_weight); KALDI_ASSERT(top_weight >= 0.0 && bottom_weight >= 0.0); - if (opts_.bottom_model_test_mode) + if (!opts_.bottom.train) bottom_weight = 0.0; - if (opts_.top_model_test_mode) + if (!opts_.top.train) top_weight = 0.0; int32 num_sequences, chunks_per_group, first_input_t, diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index 3a0bbb583ec..aa0b40b686e 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -34,48 +34,73 @@ namespace kaldi { namespace nnet3 { + +// This contains the subset of options that you can set for the bottom and the +// top model separately. They are set, for instance, as --bottom.train=false, +// or --top.dropout-test-mode=true. +struct NnetChainaTrainingPerModelOptions { + BaseFloat unadapted_weight; + bool train; + bool dropout_test_mode; + bool batchnorm_test_mode; + + NnetChainaTrainingPerModelOptions(): + unadapted_weight(0.5), + train(true), + dropout_test_mode(false), batchnorm_test_mode(false) { } + + + void Register(OptionsItf *opts) { + opts->Register("unadapted-weight", &unadapted_weight, + "Scale that is applied to the derivatives arising from the " + "unadapted pass of model evaluation, when training " + "Affects how much we prioritize the unadapted " + "features for neural nnet training."); + opts->Register("train", &train, + "Set this to false to disable training for this model."); + opts->Register("dropout-test-mode", &dropout_test_mode, + "Setting this option sets test mode on any dropout components. " + "Will persist in the model written out, if it's being trained."); + opts->Register("batchnorm-test-mode", &batchnorm_test_mode, + "Setting this option sets test mode on any batch-norm " + "(or batch-norm-like) components. "); + } + void Check() const { + KALDI_ASSERT(!(train && batchnorm_test_mode)); + KALDI_ASSERT(unadapted_weight >= 0.0); + } +}; + + struct NnetChainaTrainingOptions { NnetTrainerOptions nnet_config; chain::ChainTrainingOptions chain_config; + NnetChainaTrainingPerModelOptions top; + NnetChainaTrainingPerModelOptions bottom; bool apply_deriv_weights; - BaseFloat unadapted_top_weight; - BaseFloat unadapted_bottom_weight; int32 bottom_subsampling_factor; bool keep_embedding_context; - bool bottom_model_test_mode; - bool top_model_test_mode; bool adaptation_model_accumulate; bool adaptation_test_mode; NnetChainaTrainingOptions(): apply_deriv_weights(true), - unadapted_top_weight(0.5), - unadapted_bottom_weight(0.5), bottom_subsampling_factor(1), keep_embedding_context(true), - bottom_model_test_mode(false), - top_model_test_mode(false), adaptation_model_accumulate(false), adaptation_test_mode(false) { } void Register(OptionsItf *opts) { nnet_config.Register(opts); chain_config.Register(opts); + ParseOptions top_opts("top", opts); + top.Register(&top_opts); // Register with prefix "top". + ParseOptions bottom_opts("bottom", opts); + bottom.Register(&bottom_opts); // Register with prefix "bottom". 
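    // [Editor's note -- illustrative only, not part of the original patch:
    //  since 'top' and 'bottom' are registered through prefixed ParseOptions
    //  objects, their options take a dotted prefix on the command line, e.g.
    //  (hypothetical invocation, positional arguments omitted):
    //    nnet3-chaina-train --bottom.train=false --top.dropout-test-mode=true ...
    //  which sets bottom.train and top.dropout_test_mode in this struct.]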
+ opts->Register("apply-deriv-weights", &apply_deriv_weights, "If true, apply the per-frame derivative weights stored with " "the example"); - opts->Register("unadapted-top-weight", &unadapted_top_weight, - "Scale used for the step sizes and max-change values when " - "training the top nnet and evaluating the unadapted output. " - "Affects how strongly the top nnets are trained by the " - "unadapted embeddings. The scale on the adapted branch is " - "implicitly 1.0, but all these numbers also get multiplied " - "by language-specific weights obtained from the egs."); - opts->Register("unadapted-bottom-weight", &unadapted_bottom_weight, - "Scale that is applied to the derivatives arising from the " - "unadapted branch of the top nnets, when training the bottom " - "nnet. Affects how much we prioritize the unadapted " - "features for bottom nnet training."); opts->Register("bottom-subsampling-factor", &bottom_subsampling_factor, "Determines the frequency at which we subsample the " "embeddings from the bottom nnet. Implicitly, the " @@ -91,20 +116,6 @@ struct NnetChainaTrainingOptions { "optional dependencies (for example: if it uses " "StatisticsExtractionComponent, IfDefined(), Failover(), " "etc.)."); - opts->Register("bottom-model-test-mode", &bottom_model_test_mode, - "Set this to true to disable training of the bottom nnet, " - "to use test-mode for any batch-norm or dropout" - "components in it, and to disable the accumulation of " - "statistics for the bottom model (to keep the batchnorm " - "stats frozen). Setting this to false can be used to " - "evaluate train or valid probs."); - opts->Register("top-model-test-mode", &top_model_test_mode, - "Set this to true to disable training of the top nnet, " - "to use test-mode for any batch-norm or dropout" - "components in it, and to disable the accumulation of " - "statistics for the top model (to keep the batchnorm " - "stats frozen). Setting this to false can be used to " - "evaluate train or valid probs."); opts->Register("adaptation-model-accumulate", &adaptation_model_accumulate, "Set this to true if you want to accumulate stats for " "the adaptation model (i.e., its class-dependent means). " @@ -123,13 +134,9 @@ struct NnetChainaTrainingOptions { "(and, in any case, is likely undesirable)."); } void Check() const { - KALDI_ASSERT(unadapted_top_weight > 0.0 && - unadapted_bottom_weight >= 0.0 && - bottom_subsampling_factor > 0); - if (adaptation_model_accumulate) - KALDI_ASSERT(top_model_test_mode && bottom_model_test_mode); - if (adaptation_test_mode) - KALDI_ASSERT(bottom_model_test_mode); + KALDI_ASSERT(bottom_subsampling_factor > 0); + top.Check(); + bottom.Check(); } }; @@ -167,43 +174,23 @@ class NnetChainaModels { required, so languages that are not used by a particular job (e.g. because they were not represented in the egs) will not actually be read. - - @param [in] zero_components stats... The --zero-component-stats option - from NnetChainaTrainingOptions::nnet_config. Note: if - bottom_model_test_mode is true, we won't zero the stats on - the bottom model regardless of this value. - @param [in] bottom_model_test_mode If true, the bottom model will not be - trained (should be set to the same-named option from - NnetChainaTrainingOptions). It's needed to know - whether to write the bottom model in WriteRawModels(), - and whether to zero the component stats, set batch-norm - test mode, and collapse the model. 
- @param [in] top_model_test_mode If true, the top model will not be - trained (should be set to the same-named option from - NnetChainaTrainingOptions). It's needed to know - whether to write the top models in WriteRawModels(), - and whether to zero the component stats, set batch-norm - test mode, and collapse the model. - @param [in] adaptation_model_accumulate If true, the adaptation - models will be written out instead of the top models. - Expect both test modes above to be true in this case. + @param [in] opts Training options; needed to know which models + we should write out, and whether to set test mode + on models when reading them in. @param [in] model_dir Directory where we'll find bottom.raw, and - .mdl for each language present in the egs - (the will be worked out from the key name from - "...?lang=xxx" in the key when reading the egs, - see ParseFromQueryString() in nnet-chain-utils.h. - @param [in] den_fst_ir Directory where we'll find the denominator - FST .den.fst for each language present in - the egs. + .mdl for each language present in the egs + (the will be worked out from the key name from + "...?lang=xxx" in the key when reading the egs, + see ParseFromQueryString() in nnet-chain-utils.h. + @param [in] den_fst_dir Directory where we'll find the denominator + FST .den.fst for each language present in + the egs. @param [in] transform_dir Directory where we'll find the - transforms (of type DifferentiableTransformItf), - as files .ada for each language present - in the egs. - */ - NnetChainaModels(bool zero_component_stats, - bool bottom_model_test_mode, - bool top_model_test_mode, - bool adaptation_model_accumulate, + transforms (of type DifferentiableTransformItf), + as files .ada for each language present + in the egs. + */ + NnetChainaModels(const NnetChainaTrainingOptions &opts, const std::string &model_dir, const std::string &den_fst_dir, const std::string &transform_dir); @@ -232,10 +219,10 @@ class NnetChainaModels { const std::string &language_name); // Writes out the following files: - // /bottom..raw (if !bottom_model_test_mode) + // /bottom..raw (if opts_.bottom.train) // and, for each language that we accessed, - // /..raw (if !top_model_test_mode) - // /..ada (if adaptation_model_accumulate) + // /..raw (if opts_.top.train) + // /..ada (if opts_.adaptation_model_accumulate) // // Thus, this writes out any models that we trained. There is no // corresponding Read() function. @@ -277,15 +264,8 @@ class NnetChainaModels { // contents from disk) if it does not already exist. LanguageInfo *GetInfoForLang(const std::string &lang); - // True if we are going to call ZeroComponentStats() on models when they are - // read. - bool zero_component_stats_; - // A copy of the "bottom-model-test-mode" option in NnetChainaTrainingOptions. - bool bottom_model_test_mode_; - // A copy of the "top-model-test-mode" option in NnetChainaTrainingOptions. - bool top_model_test_mode_; - // A copy of the "adaptation-model-accumulate" option in NnetChainaTrainingOptions. - bool adaptation_model_accumulate_; + + const NnetChainaTrainingOptions &opts_; // Directory where models are located. std::string model_dir_; // Directory where denominator FSTs are located. 
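[Editor's note: the following summary is an editorial illustration, not text from the
original commit. Judging from the help strings removed above, the old single test-mode
switches decompose into the new per-model options approximately as shown below; the
exact flag combination for any given use case is the editor's assumption:

    # old interface (removed in this patch)
    --bottom-model-test-mode=true
    # roughly equivalent new interface
    --bottom.train=false --bottom.batchnorm-test-mode=true --bottom.dropout-test-mode=true

and similarly --top-model-test-mode=true maps onto the corresponding --top.* options.]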
diff --git a/src/nnet3abin/nnet3-chaina-train.cc b/src/nnet3abin/nnet3-chaina-train.cc index aade5070ee0..f6f98b6ffd3 100644 --- a/src/nnet3abin/nnet3-chaina-train.cc +++ b/src/nnet3abin/nnet3-chaina-train.cc @@ -88,10 +88,7 @@ int main(int argc, char *argv[]) { egs_rspecifier = po.GetArg(4), model_out_dir = po.GetOptArg(5); - NnetChainaModels models(chaina_opts.nnet_config.zero_component_stats, - chaina_opts.bottom_model_test_mode, - chaina_opts.top_model_test_mode, - chaina_opts.adaptation_model_accumulate, + NnetChainaModels models(chaina_opts, model_in_dir, den_fst_dir, transform_dir); { From d130837f51807cd162683f3248f4228d8cc0b048 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 22 Jan 2019 19:38:52 -0500 Subject: [PATCH 84/87] [egs,scripts,src] Add model combination; and LM rescoring in the egs --- .../s5/local/chaina/tuning/run_tdnn_1j.sh | 151 ++--------- egs/wsj/s5/steps/chaina/train.sh | 37 ++- src/adapt/differentiable-transform-itf.cc | 7 + src/adapt/differentiable-transform-itf.h | 6 +- src/hmm/transition-model.h | 4 +- src/nnet3a/nnet-chaina-training.cc | 148 +++++++++-- src/nnet3a/nnet-chaina-training.h | 55 +++- src/nnet3abin/Makefile | 2 +- src/nnet3abin/nnet3-chaina-combine.cc | 246 ++++++++++++++++++ 9 files changed, 493 insertions(+), 163 deletions(-) create mode 100644 src/nnet3abin/nnet3-chaina-combine.cc diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1j.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1j.sh index 5912667dc42..e575a8275ab 100755 --- a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1j.sh +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1j.sh @@ -11,11 +11,30 @@ #%WER 16.56 [ 3334 / 20138, 289 ins, 470 del, 2575 sub ] exp/chaina/tdnn1j_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 #a09:s5: grep WER exp/chaina/tdnn1j_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh #%WER 12.95 [ 2608 / 20138, 248 ins, 383 del, 1977 sub ] exp/chaina/tdnn1j_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 - +## And a rerun: +# a09:s5: grep WER exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.08 [ 3239 / 20138, 272 ins, 484 del, 2483 sub ] exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 13.16 [ 2651 / 20138, 236 ins, 402 del, 2013 sub ] exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +## Then after introducing model combination we got: +# grep WER exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +#%WER 14.53 [ 2927 / 20138, 301 ins, 347 del, 2279 sub ] exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +#b10:s5: grep WER exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +#%WER 11.34 [ 2283 / 20138, 234 ins, 303 del, 1746 sub ] exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 +# And after LM rescoring: +# %WER 8.26 [ 1663 / 20138, 243 ins, 147 del, 1273 sub ] exp/chaina/tdnn1j2_sp/decode_dev_clean_2_tglarge/wer_10_0.5 + +# the baseline 1i: #a09:s5: grep WER exp/chaina/tdnn1i_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh #%WER 16.85 [ 3393 / 20138, 310 ins, 481 del, 2602 sub ] exp/chaina/tdnn1i_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 #a09:s5: grep WER exp/chaina/tdnn1i_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh #%WER 13.37 [ 2693 / 20138, 243 ins, 398 del, 2052 sub ] exp/chaina/tdnn1i_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 +# a rerun of the baseline 1i: 
+#a09:s5: grep WER exp/chaina/tdnn1i2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +#%WER 16.71 [ 3365 / 20138, 255 ins, 567 del, 2543 sub ] exp/chaina/tdnn1i2_sp/decode_dev_clean_2_tgsmall.si/wer_12_0.0 +#a09:s5: grep WER exp/chaina/tdnn1i2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +#%WER 13.28 [ 2675 / 20138, 259 ins, 374 del, 2042 sub ] exp/chaina/tdnn1i2_sp/decode_dev_clean_2_tgsmall/wer_11_0.0 # 1i is as 1h but replacing half the mean-transformed dims with fMLLR in blocks of 8. @@ -424,136 +443,20 @@ if [ $stage -le 24 ]; then fi if [ $stage -le 25 ]; then - # Do the speaker-dependent decoding pass + # Do the speaker-dependent decoding pass and LM rescoring test_sets=dev_clean_2 for data in $test_sets; do steps/chaina/decode.sh --cmd "$cmd" --num-threads 4 \ - data/${data}_hires $tree_dir/graph_tgsmall\ - $dir/final $dir/data/final/${data} \ - $dir/decode_${data}_tgsmall.si $dir/decode_${data}_tgsmall - done -fi - - -exit 0; - - - # Work out the model - # The following script is equivalent to doing something like the - # following: - # cat > $dir/init/info.txt < $dir/init/info.txt -fi + data/${data}_hires $tree_dir/graph_tgsmall\ + $dir/final $dir/data/final/${data} \ + $dir/decode_${data}_tgsmall.si $dir/decode_${data}_tgsmall -if [ $stage -le 14 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.0 \ - --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=20 \ - --trainer.frames-per-iter=3000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=5 \ - --trainer.optimization.initial-effective-lrate=0.002 \ - --trainer.optimization.final-effective-lrate=0.0002 \ - --trainer.num-chunk-per-minibatch=128,64 \ - --egs.chunk-width=$chunk_width \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 15 ]; then - # Note: it's not important to give mkgraph.sh the lang directory with the - # matched topology (since it gets the topology file from the model). 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if [ $stage -le 16 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l Copy(); +} + + } // namespace differentiable_transform } // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h index 6dd75d3137f..e2842cf6af0 100644 --- a/src/adapt/differentiable-transform-itf.h +++ b/src/adapt/differentiable-transform-itf.h @@ -382,11 +382,11 @@ class DifferentiableTransform { DifferentiableTransform(): num_classes_(-1) { } + virtual ~DifferentiableTransform() { } + protected: DifferentiableTransform(const DifferentiableTransform &other): num_classes_(other.num_classes_) { } - virtual ~DifferentiableTransform() { } - protected: int32 num_classes_; }; @@ -433,6 +433,8 @@ struct DifferentiableTransformMapped { ~DifferentiableTransformMapped() { delete transform; } + // Copy constructor + DifferentiableTransformMapped(const DifferentiableTransformMapped &other); }; diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index e453c24f9cb..c41ec2e7b32 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -251,6 +251,7 @@ class TransitionModel { /// compare the transition probabilities. bool Compatible(const TransitionModel &other) const; + TransitionModel(const TransitionModel &other) = default; private: void MleUpdateShared(const Vector &stats, const MleTransitionUpdateConfig &cfg, @@ -321,7 +322,8 @@ class TransitionModel { /// of pdfs). int32 num_pdfs_; - KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel); + // Disallow assignment by making it private; this won't be defined. + TransitionModel &operator=(const TransitionModel &other); }; inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index e692ae1762e..c20dc9d425b 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -36,24 +36,88 @@ NnetChainaModels::NnetChainaModels( std::string bottom_nnet_name; // model_dir/bottom.raw GetPathname(model_dir, "bottom", "raw", &bottom_nnet_name); ReadKaldiObject(bottom_nnet_name, &bottom_nnet_); - // we could change that condition later if it turns out to be a problem. - if (opts_.nnet_config.zero_component_stats && - !opts_.bottom.batchnorm_test_mode) - ZeroComponentStats(&bottom_nnet_); ComputeSimpleNnetContext(bottom_nnet_, &bottom_nnet_left_context_, &bottom_nnet_right_context_); - if (opts_.bottom.batchnorm_test_mode) - SetBatchnormTestMode(true, &bottom_nnet_); - if (opts_.bottom.dropout_test_mode) - SetDropoutTestMode(true, &bottom_nnet_); - if (!opts_.bottom.train && opts_.bottom.batchnorm_test_mode) { - // The following is for efficiency in evaluating the bottom nnet, + bool is_top_nnet = false; + InitializeNnet(is_top_nnet, &bottom_nnet_); +} + +void NnetChainaModels::InitializeNnet( + bool is_top_nnet, Nnet *nnet) const { + const NnetChainaTrainingPerModelOptions &bottom_or_top_opts = + (is_top_nnet ? opts_.top : opts_.bottom); + + // we could change that condition later if it turns out to be a problem. 
+ if (bottom_or_top_opts.batchnorm_test_mode) + SetBatchnormTestMode(true, nnet); + if (bottom_or_top_opts.dropout_test_mode) + SetDropoutTestMode(true, nnet); + if (!bottom_or_top_opts.train && bottom_or_top_opts.batchnorm_test_mode) { + // The following is for efficiency in evaluating the nnet; // it may combine certain component types. - CollapseModel(CollapseModelConfig(), &bottom_nnet_); + CollapseModel(CollapseModelConfig(), nnet); + } +} + +NnetChainaModels::LanguageInfo::LanguageInfo( + const NnetChainaModels::LanguageInfo &other): + trans_model(other.trans_model), + am_nnet(other.am_nnet), + den_fst(other.den_fst), + transform(other.transform) { } + + +// This code is related to UpdateNnetMovingAverage() in nnet3-chain-combine.cc. +void NnetChainaModels::InterpolateWith( + BaseFloat new_model_weight, + const std::string &model_dir) { + KALDI_ASSERT(new_model_weight > 0.0 && new_model_weight < 1.0); + + std::string bottom_filename; + GetPathname(model_dir, "bottom", "raw", &bottom_filename); + Nnet bottom_nnet; // we don't need the transition model, and the reading code + // is capable of ignoring it. + ReadKaldiObject(bottom_filename, &bottom_nnet); + bool is_top_nnet = false; + InitializeNnet(is_top_nnet, &bottom_nnet); + ScaleNnet(1.0 - new_model_weight, &bottom_nnet_); + AddNnet(bottom_nnet, new_model_weight, &bottom_nnet_); + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang = iter->first; + LanguageInfo *info = iter->second; + std::string model_filename; + GetPathname(model_dir, lang, "mdl", &model_filename); + Nnet top_nnet; // we don't need the transition model, and the reading code + // is capable of ignoring it. + ReadKaldiObject(model_filename, &top_nnet); + is_top_nnet = true; + InitializeNnet(is_top_nnet, &top_nnet); + Nnet &stored_nnet = info->am_nnet.GetNnet(); + ScaleNnet(1.0 - new_model_weight, &stored_nnet); + AddNnet(top_nnet, new_model_weight, &stored_nnet); + } +} + + +NnetChainaModels::NnetChainaModels(const NnetChainaModels &other): + opts_(other.opts_), + model_dir_(other.model_dir_), + den_fst_dir_(other.den_fst_dir_), + transform_dir_(other.transform_dir_), + bottom_nnet_(other.bottom_nnet_), + bottom_nnet_left_context_(other.bottom_nnet_left_context_), + bottom_nnet_right_context_(other.bottom_nnet_right_context_) { + for (auto iter = other.lang_info_.begin(); + iter != other.lang_info_.end(); ++iter) { + const std::string &lang = iter->first; + LanguageInfo *info = iter->second; + lang_info_[lang] = new LanguageInfo(*info); } } + + void NnetChainaModels::GetPathname(const std::string &dir, const std::string &name, const std::string &suffix, @@ -92,17 +156,8 @@ NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( info->trans_model.Read(ki.Stream(), binary); info->am_nnet.Read(ki.Stream(), binary); Nnet &nnet = info->am_nnet.GetNnet(); - if (opts_.nnet_config.zero_component_stats && - !opts_.top.batchnorm_test_mode) - ZeroComponentStats(&nnet); - if (opts_.top.batchnorm_test_mode) - SetBatchnormTestMode(true, &nnet); - if (opts_.top.dropout_test_mode) - SetDropoutTestMode(true, &nnet); - // The following is for efficiency in evaluating the top nnet, - // it may combine certain component types. 
- if (!opts_.top.train && opts_.top.batchnorm_test_mode) - CollapseModel(CollapseModelConfig(), &nnet); + bool is_top_nnet = true; + InitializeNnet(is_top_nnet, &nnet); } ReadFstKaldi(den_fst_filename, &(info->den_fst)); ReadKaldiObject(transform_filename, &(info->transform)); @@ -191,6 +246,29 @@ void NnetChainaModels::Write(const std::string &model_out_dir, } +void NnetChainaModels::WriteCombinedModels(const std::string &model_out_dir, + bool binary) { + + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + + std::ostringstream ss; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + std::string top_model_name; + GetPathname(model_out_dir, lang_name, "mdl", &top_model_name); + + Output ko(top_model_name, binary); + info->trans_model.Write(ko.Stream(), binary); + info->am_nnet.Write(ko.Stream(), binary); + } + KALDI_LOG << "Wrote bottom.raw and .mdl files for languages:" + << ss.str() << "to: " << model_out_dir; +} + NnetChainaModels::~NnetChainaModels() { for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) delete iter->second; @@ -215,7 +293,8 @@ NnetChainaTopTrainer::NnetChainaTopTrainer( config.Check(); - if (opts_.nnet_config.zero_component_stats) + if (opts_.nnet_config.zero_component_stats && + !opts_.top.batchnorm_test_mode) ZeroComponentStats(nnet); ScaleNnet(0.0, delta_nnet_); @@ -749,6 +828,13 @@ void ConvertPosterior( } } +BaseFloat NnetChainaTopTrainer::GetTotalObjf(bool adapted, BaseFloat *weight) const { + const ObjectiveFunctionInfo &objf = + (adapted ? output_objf_ : output_si_objf_); + *weight = objf.tot_weight; + return objf.tot_objf; +} + bool NnetChainaTopTrainer::PrintTotalStats() const { bool ans = false; if (output_si_objf_.PrintTotalStats(lang_name_ + ":output-si")) @@ -879,7 +965,8 @@ NnetChainaBottomTrainer::NnetChainaBottomTrainer( compiler_(*nnet, opts_.nnet_config.optimize_config, opts_.nnet_config.compiler_config), max_change_stats_(*nnet) { - if (opts_.nnet_config.zero_component_stats) + if (opts_.nnet_config.zero_component_stats && + !opts_.bottom.batchnorm_test_mode) ZeroComponentStats(nnet); ScaleNnet(0.0, delta_nnet_); if (opts_.nnet_config.read_cache != "") { @@ -976,6 +1063,19 @@ void NnetChainaTrainer::GetContextInfo( } +BaseFloat NnetChainaTrainer::GetTotalObjf( + bool adapted, BaseFloat *weight) const { + *weight = 0.0; + BaseFloat tot_objf = 0.0; + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) { + BaseFloat this_weight; + tot_objf += iter->second->GetTotalObjf(adapted, &this_weight); + *weight += this_weight; + } + return tot_objf; +} + bool NnetChainaTrainer::PrintTotalStats() const { bottom_trainer_.PrintTotalStats(); bool ans = false; diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h index aa0b40b686e..559fb9dfba4 100644 --- a/src/nnet3a/nnet-chaina-training.h +++ b/src/nnet3a/nnet-chaina-training.h @@ -195,6 +195,23 @@ class NnetChainaModels { const std::string &den_fst_dir, const std::string &transform_dir); + // Copy constructor + NnetChainaModels(const NnetChainaModels &other); + + + /* + This interpolates the (top and bottom) models stored here with the one in + 'model_dir', giving a weight 0 < new_model_weight < 1 to the new models. 
+ All models currently loaded will be looked for (this depends what + languages were present in the egs), so you need to actually use this + object for training or objective evaluation before calling this function + on it. + */ + void InterpolateWith( + BaseFloat new_model_weight, + const std::string &model_dir); + + Nnet* GetBottomNnet(); /** @@ -230,6 +247,15 @@ class NnetChainaModels { bool binary, int32 job_id); + // This is a version of Write() is specialized for use by the + // model-combination code; it differs from the Write() above in + // that it writes out all models we have (ignoring whether or not + // they were trained), and it writes out the 'top' models as + // .mdl files (including the transition models). + void WriteCombinedModels(const std::string &model_out_dir, + bool binary); + + ~NnetChainaModels(); private: // This function sets "pathname" to the string: @@ -239,8 +265,11 @@ class NnetChainaModels { const std::string &suffix, std::string *pathname); - // This version of GetPathname() sets "pathname" to the string: + // If job_id is >= 0, then this version of GetPathname() sets "pathname" to + // the string: // /.. + // otherwise (job_id < 0) it sets it to + // /. void GetPathname(const std::string &dir, const std::string &name, int32 job_id, @@ -257,8 +286,14 @@ class NnetChainaModels { fst::StdVectorFst den_fst; // transform comes from /.ada differentiable_transform::DifferentiableTransformMapped transform; + LanguageInfo() { } + // Copy constructor + LanguageInfo(const LanguageInfo &other); }; + // Depending on opts_, this function may zero the component stats, set test + // mode for batchnorm and/or dropout components, and do model-collapsing. + void InitializeNnet(bool is_top_nnet, Nnet *nnet) const; // get the LanguageInfo* for this language, creating it (and reading its // contents from disk) if it does not already exist. @@ -279,7 +314,6 @@ class NnetChainaModels { int32 bottom_nnet_left_context_; int32 bottom_nnet_right_context_; - std::unordered_map lang_info_; }; @@ -379,6 +413,15 @@ class NnetChainaTopTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; + + // Returns the total objective-function value for the adapted computation (if + // adapted == true), or the unadapted/speaker-independent computation + // otherwise, with the corresponding weight (which can be interpreted as a + // frame count) written to 'weight'. The returned value would normally be + // divided by 'weight' before being displayed. + BaseFloat GetTotalObjf(bool adapted, BaseFloat *weight) const; + + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do // this after the first minibatch of training, to reduce fragmentation. void ConsolidateMemory(); @@ -819,6 +862,14 @@ class NnetChainaTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; + // Returns the total objective-function value, summed over all languages + // present, for the adapted computation (if adapted == true), or the + // unadapted/speaker-independent computation otherwise, with the corresponding + // weight (which can be interpreted as a frame count) written to 'weight'. + // The returned value would normally be divided by 'weight' before being + // displayed. + BaseFloat GetTotalObjf(bool adapted, BaseFloat *weight) const; + // Prints out the max-change stats (if nonzero): the percentage of time that // per-component max-change and global max-change were enforced. 
void PrintMaxChangeStats() const; diff --git a/src/nnet3abin/Makefile b/src/nnet3abin/Makefile index d763dcf9cc5..224c45a5bcd 100644 --- a/src/nnet3abin/Makefile +++ b/src/nnet3abin/Makefile @@ -6,7 +6,7 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -BINFILES = nnet3-adapt nnet3-chaina-train +BINFILES = nnet3-adapt nnet3-chaina-train nnet3-chaina-combine OBJFILES = diff --git a/src/nnet3abin/nnet3-chaina-combine.cc b/src/nnet3abin/nnet3-chaina-combine.cc new file mode 100644 index 00000000000..857b231a076 --- /dev/null +++ b/src/nnet3abin/nnet3-chaina-combine.cc @@ -0,0 +1,246 @@ +// nnet3bin/nnet3-chaina-combine.cc + +// Copyright 2019 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3a/nnet-chaina-training.h" +#include "cudamatrix/cu-allocator.h" + +namespace kaldi { +namespace nnet3 { + + +/** + Computes the average objective function of the provided egs with the provided + set of models. + @param [in] opts The options class for the objective computation + (shares the same option as training, but we set the training + options to true) + @param [in] unadapted_objf_weight A number in the range [0,1] that says + how much weight we put on the unadapted version of the + objective function when choosing models. + @param [in] num_models_averaged Needed only for diagnostics-- the + number of sets of models that we averaged to get the + models in the 'models' object + @param [in] keys_and_egs The vector containing the examples we + are to evaluate the objective function on, and the corresponding + string-valued keys (needed because the language name and + example weight are optionally encoded in it). + @param [in,out] models The models that we are evaluating the objective + function. These will only be modified to to the extent that + the batchnorm stats and any component-level stats would be + affected. 
+ */ +BaseFloat GetObjectiveFunction( + const NnetChainaTrainingOptions &opts, + BaseFloat unadapted_objf_weight, + int32 num_models_averaged, + const std::vector >& keys_and_egs, + NnetChainaModels *models) { + KALDI_ASSERT(!opts.top.train && !opts.bottom.train); + NnetChainaTrainer trainer(opts, models); + size_t num_egs = keys_and_egs.size(); + for (size_t i = 0; i < num_egs; i++) { + trainer.Train(keys_and_egs[i].first, keys_and_egs[i].second); + } + BaseFloat weight, adapted_objf, unadapted_objf; + adapted_objf = trainer.GetTotalObjf(true, &weight); + adapted_objf /= weight; + unadapted_objf = trainer.GetTotalObjf(false, &weight); + unadapted_objf /= weight; + BaseFloat ans = unadapted_objf_weight * unadapted_objf + + (1.0 - unadapted_objf_weight) * adapted_objf; + KALDI_LOG << "When averaging " << num_models_averaged + << " models, objf values (unadapted/si,adapted) " + << unadapted_objf << ", " << adapted_objf + << ", interpolated = " << ans << "; over " + << weight << " frames."; + return ans; +} + +void ReadExamples( + const std::string &egs_rspecifier, + std::vector > *keys_and_egs) { + keys_and_egs->reserve(10000); // reserve a lot of space to minimize the chance of + // reallocation. + SequentialNnetChainExampleReader example_reader(egs_rspecifier); + for (; !example_reader.Done(); example_reader.Next()) { + size_t i = keys_and_egs->size(); + keys_and_egs->resize(i + 1); + keys_and_egs->back().first = example_reader.Key(); + keys_and_egs->back().second.Swap(&(example_reader.Value())); + } + KALDI_LOG << "Read " << keys_and_egs->size() << " examples."; + KALDI_ASSERT(!keys_and_egs->empty()); +} + + +} // namespace nnet3 +} // namespace kaldi + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "This program does the final model-combination stage of 'chaina'\n" + "acoustic training: it averages over the last n models, where the\n" + "'n' is chosen (by this program) based on maximizing the objective\n" + "function on the data given to it. It maximizes the average of the\n" + "speaker-independent and speaker-dependent versions of the 'chain'\n" + "objective values.\n" + "This program is intended to be used with a GPU.\n" + "\n" + "Usage: nnet3-chaina-combine [options] ... \\\n" + " \n" + "\n" + " should contain bottom.raw, and .mdl for each language \n" + " (these will be averaged over a range of indexes including N, e.g. just modelN, or\n" + " modelN with model(N-1), and so on).\n" + " should contain .den.fst for each language \n" + " should contain .ada for each language \n" + " is a place to where bottom.mdl and .mdl for each language\n" + " that was seen in the egs, will be written (for , see the --job-id option).\n"; + + + int32 srand_seed = 0; + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainaTrainingOptions chaina_opts; + chaina_opts.top.train = false; + chaina_opts.bottom.train = false; + chaina_opts.top.dropout_test_mode = true; + chaina_opts.bottom.dropout_test_mode = true; + // But leave the batchnorm test-modes at false. + + // Setting batchnorm_stats_scale to 1.0 means it won't scale down the + // batchnorm stats as it goes (the default is 0.8), so they will all be + // remembered. 
Note: each time we initialize and use the trainer object, in + // GetObjectiveFunction, it will call ZeroComponentStats() for both the + // bottom and top models (assuming the options are the defaults), so only + // the stats from the most recent run will be present. + chaina_opts.nnet_config.batchnorm_stats_scale = 1.0; + + BaseFloat unadapted_objf_weight = 0.5; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("unadapted-weight", &unadapted_objf_weight, + "The weight we give to the unadapted version of the objective function " + "when evaluating the goodness of models (the adapted objective gets " + "1 minus this value as its weight)"); + + + chaina_opts.Register(&po); + RegisterCuAllocatorOptions(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 5) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + int32 n = po.NumArgs() - 4; // n is the number of models we have + // available to average. + + std::string last_model_in_dir = po.GetArg(n), + den_fst_dir = po.GetArg(n + 1), + transform_dir = po.GetArg(n + 2), + egs_rspecifier = po.GetArg(n + 3), + model_out_dir = po.GetOptArg(n + 4); + + NnetChainaModels models(chaina_opts, + last_model_in_dir, den_fst_dir, + transform_dir); + + + std::vector > keys_and_egs; + ReadExamples(egs_rspecifier, &keys_and_egs); + + // first evaluates the objective using the last model. + int32 best_num_to_combine = -1; + BaseFloat best_objf = -std::numeric_limits::infinity(), + single_model_objf; + + std::unique_ptr best_models; + + for (int32 num_models = 1; num_models <= n; num_models++) { + if (num_models > 1) + models.InterpolateWith(1.0 / num_models, po.GetArg(n + 1 - num_models)); + BaseFloat objf = GetObjectiveFunction(chaina_opts, unadapted_objf_weight, + num_models, keys_and_egs, &models); + if (objf > best_objf || num_models == 1) { + best_objf = objf; + best_models = std::unique_ptr( + new NnetChainaModels(models)); + best_num_to_combine = num_models; + if (num_models == 1) + single_model_objf = objf; + } + if (num_models > best_num_to_combine + 4 && num_models < n) + KALDI_LOG << "Stopping the search early as it looks like we found " + "the best combination"; + } + + KALDI_LOG << "Best objective function was " << best_objf << " with " + << best_num_to_combine << " models."; + KALDI_LOG << "About to recompute objective function with batchnorm in " + "test-mode:\n"; + chaina_opts.top.batchnorm_test_mode = true; + chaina_opts.bottom.batchnorm_test_mode = true; + + BaseFloat test_mode_objf = + GetObjectiveFunction(chaina_opts, unadapted_objf_weight, + best_num_to_combine, + keys_and_egs, + best_models.get()); + KALDI_LOG << "Objf with test-mode batchnorm was " << test_mode_objf + << " (vs. 
" << best_objf << " without test mode)"; + + KALDI_LOG << "Combination changed the objective from " + << single_model_objf << " with only the final model, to " + << best_objf << " with " << best_num_to_combine + << " models."; + + best_models->WriteCombinedModels(model_out_dir, binary_write); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} From d204bca86931b27dd3602c8f1904933356f20a7d Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 26 Jan 2019 02:10:06 -0500 Subject: [PATCH 85/87] Set random seed in choose_egs_to_merge.py --- .../s5/steps/chaina/internal/choose_egs_to_merge.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py index 52582fdcde4..a4e8a44c1cd 100755 --- a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -37,6 +37,8 @@ def get_args(): epilog="E.g. " + sys.argv[0] + "*** TODO *** ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--random-seed', type=int, + default = 123, help='Random seed.') parser.add_argument("--chunks-per-group", type=int, default=4, help="Number of chunks per speaker in the final egs (actually " "means the number of chunks per group of chunks, and they are " @@ -222,7 +224,8 @@ def write_egs(filename, group_indexes, all_groups): def choose_egs(args): """ The main part of the program. """ - + random.seed(args.random_seed) + logger.info('Set random seed to {}.'.format(args.random_seed)) all_chunks = read_all_chunks(args.scp_in) logger.info('Loaded {} chunks.'.format(len(all_chunks))) @@ -239,9 +242,10 @@ def choose_egs(args): assert(args.num_repeats == 1 or args.num_repeats == 2) groups = [] # All groups from all sub-lists - for sublist in chunk_to_sublist.values(): + for context_structure in sorted(chunk_to_sublist.keys()): + sublist = chunk_to_sublist[context_structure] logger.info('Processing chunks with context ' - 'structure: {}'.format(sublist[0].context_structure)) + 'structure: {}'.format(context_structure)) num_groups = (len(sublist) + args.chunks_per_group - 1) // args.chunks_per_group for i in range(num_groups): From 093290b9a384e2a79c765f638b2a15e22bb5df7b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 29 Jan 2019 00:53:10 -0500 Subject: [PATCH 86/87] [src] Bug fix; disable freezing of NG (which seems to hurt) --- egs/wsj/s5/steps/nnet3/chain/align_lats.sh | 149 +++++++++++++++++++++ src/adapt/differentiable-fmllr.cc | 6 +- src/nnet3a/nnet-chaina-training.cc | 8 +- 3 files changed, 157 insertions(+), 6 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/chain/align_lats.sh diff --git a/egs/wsj/s5/steps/nnet3/chain/align_lats.sh b/egs/wsj/s5/steps/nnet3/chain/align_lats.sh new file mode 100755 index 00000000000..ed10735245d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/align_lats.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# 2015 Vijayaditya Peddinti +# 2016 Vimal Manohar +# 2017 Pegah Ghahremani +# Apache 2.0 + +# Computes training alignments using nnet3 DNN, with output to lattices. + +# Begin configuration section. +nj=4 +cmd=run.pl +stage=-1 +# Begin configuration. 
+scale_opts="--transition-scale=1.0 --self-loop-scale=1.0" +acoustic_scale=1.0 +post_decode_acwt=10.0 +beam=20 +iter=final +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +graphs_scp= +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split${nj} +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ + split_data.sh $data $nj || exit 1; + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +echo "$0: feature type is raw" + +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + +ivector_opts= +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + cp $srcdir/frame_subsampling_factor $dir + if [ "$frame_subsampling_factor" -gt 1 ] && \ + [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then + echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," + echo "... but the scale opts are the defaults. You probably want" + echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" + sleep 1 + fi +fi + +if [ ! -z "$graphs_scp" ]; then + if [ ! -f $graphs_scp ]; then + echo "Could not find graphs $graphs_scp" && exit 1 + fi + tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |" + prog=compile-train-graphs-fsts +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + prog=compile-train-graphs +fi + +if [ $stage -le 0 ]; then + ## because nnet3-latgen-faster doesn't support adding the transition-probs to the + ## graph itself, we need to bake them into the compiled graphs. 
This means we can't reuse previously compiled graphs, + ## because the other scripts write them without transition probs. + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + $prog --read-disambig-syms=$lang/phones/disambig.int \ + $scale_opts \ + $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1 +fi + +if [ $stage -le 1 ]; then + # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). + # --allow_partial=false makes sure we reach the end of the decoding graph. + # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --beam=$beam --lattice-beam=$beam \ + --allow-partial=false --word-determinize=false \ + $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \ + "$feats" "ark:|lattice-copy --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +echo "$0: done generating lattices from training transcripts." diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc index 66bda183cfe..faabc7b1496 100644 --- a/src/adapt/differentiable-fmllr.cc +++ b/src/adapt/differentiable-fmllr.cc @@ -283,7 +283,7 @@ void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { t_.Resize(num_classes, kUndefined); for (int32 i = 0; i < num_classes; i++) { BaseFloat gamma_i = gamma_(i); - if (gamma_i == 0.0) { + if (gamma_i < 1.0e-10) { // the i'th row of mu will already be zero. s_(i) = variance_floor_; } else { @@ -304,6 +304,8 @@ void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { for (int32 i = 0; i < num_classes; i++) { t_(i) = (BaseFloat(1.0) - f) * s_(i) + f * s; } + { BaseFloat sum = mu_.Sum(); KALDI_ASSERT(sum - sum == 0); } // TEMP + // Clear the stats, which won't be needed any longer. m_.Resize(0, 0); v_.Resize(0); @@ -332,7 +334,7 @@ void GaussianEstimator::AddToOutputDerivs( for (int32 i = 0; i < num_classes; i++) { SubVector m_bar_i(m_bar_, i); BaseFloat gamma_i = gamma_(i); - if (gamma_i != 0.0) { + if (gamma_i > 1.0e-10) { if (s_(i) != variance_floor) { BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; v_bar_(i) += s_bar_i / (gamma_i * dim); diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc index c20dc9d425b..340b4dece7d 100644 --- a/src/nnet3a/nnet-chaina-training.cc +++ b/src/nnet3a/nnet-chaina-training.cc @@ -497,8 +497,8 @@ bool NnetChainaTopTrainer::TrainUnadapted( // matrices on this data because we'll next be running the same nnet on the // speaker-adapted version of the same data, and it would violate the // independence assumptions needed for NG to work if we updated them. 
- if (need_model_deriv) - FreezeNaturalGradient(true, delta_nnet_); + //if (need_model_deriv) + // FreezeNaturalGradient(true, delta_nnet_); computer.AcceptInput("output-si", &output_deriv); @@ -511,8 +511,8 @@ bool NnetChainaTopTrainer::TrainUnadapted( if (input_deriv != NULL) computer.GetOutputDestructive("input", input_deriv); - if (need_model_deriv) // Un-freeze the natural gradient. - FreezeNaturalGradient(false, delta_nnet_); + //if (need_model_deriv) // Un-freeze the natural gradient. + // FreezeNaturalGradient(false, delta_nnet_); // We'll wait until after the adapted pass to call UpdateNnetWithMaxChange(). // Training the model on these features in between the two passes would leave From 50eb3c48ca0ef869d48aa448fac69c29aa5a9c29 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 29 Jan 2019 00:54:23 -0500 Subject: [PATCH 87/87] [scripts] chaina training script fix --- egs/wsj/s5/steps/chaina/train.sh | 100 ++++++++++++++++--------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh index 7294fc5d24d..0bfefd43b21 100755 --- a/egs/wsj/s5/steps/chaina/train.sh +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -20,8 +20,8 @@ use_gpu=yes # can be "yes", "no", "optional", "wait" common_opts= # Options passed through to nnet3-chaina-train and nnet3-chaina-combine -unadapted_top_weight=0.5 -unadapted_bottom_weight=0.5 +top_unadapted_weight=0.5 +bottom_unadapted_weight=0.5 num_epochs=4.0 # Note: each epoch may actually contain multiple repetitions of # the data, for various reasons: @@ -44,7 +44,6 @@ diagnostic_period=5 # Get diagnostics every this-many iterations shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the groups # on each iter. -train=true # use --train false to run only diagnostics. 
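
For context on the model-combination stage that this script drives:
nnet3-chaina-combine (earlier in this series) starts from the newest model
and, at step k, folds the next-oldest model in with weight 1/k, so after each
step it holds a plain average of the last k models; the final hunk of this
patch reorders the model directories passed to it so that the newest model is
the last argument, which is the order that program expects.  The short C++
sketch below illustrates the running-average identity; it is only an
illustration with made-up numbers, and it assumes InterpolateWith(alpha, other)
forms (1 - alpha) * current + alpha * other.

// Illustrative sketch (not part of the patch): repeatedly doing
//   avg = (1 - 1/k) * avg + (1/k) * x_k,   for k = 2, 3, ..., n,
// starting from avg = x_1, leaves 'avg' equal to the plain mean of
// x_1, ..., x_k after every step.  This is the identity that lets the
// combination program evaluate "average of the last k models" without
// re-reading all k models at each step.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Made-up scalars standing in for whole model parameter vectors.
  std::vector<double> params = {0.5, 2.0, -1.0, 4.0, 3.5};
  double avg = params[0];   // start from the "newest" model
  double sum = params[0];
  for (std::size_t k = 2; k <= params.size(); k++) {
    double alpha = 1.0 / k;
    avg = (1.0 - alpha) * avg + alpha * params[k - 1];  // InterpolateWith-style update
    sum += params[k - 1];
    std::printf("k=%zu  running-average=%.6f  direct-mean=%.6f\n",
                k, avg, sum / k);
  }
  return 0;
}
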
@@ -172,59 +171,64 @@ while [ $x -lt $num_iters ]; do done fi - if $train; then - if [ -d $dir/$next_x ]; then - echo "$0: removing previous contents of $dir/$next_x" - rm -r $dir/$next_x - fi - mkdir -p $dir/$next_x - - for j in $(seq $num_jobs); do - scp_index=${scp_indexes[$j]} - frame_shift=${frame_shifts[$j]} - - $cmd $gpu_cmd_opt $dir/log/train.$x.$j.log \ - nnet3-chaina-train --job-id=$j --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ - --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ - --bottom-subsampling-factor=$bottom_subsampling_factor \ - --print-interval=10 --max-param-change=$max_param_change \ - --l2-regularize-factor=$inv_num_jobs --optimization.memory-compression-level=$memory_compression_level \ - $model_in_dir $den_fst_dir $transform_dir \ - "ark:nnet3-chain-copy-egs --frame-shift=$frame_shift scp:$egs_dir/train.$scp_index.scp ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ - $model_out_dir || touch $dir/$next_x/.error & - done - wait - if [ -f $dir/$next_x/.error ]; then - echo "$0: error detected training on iteration $x" - exit 1 - fi - # First average the bottom models - models=$(for j in $(seq $num_jobs); do echo $dir/$next_x/bottom.$j.raw; done) - run.pl $dir/log/average.$x.log \ - nnet3-average $models - \| \ - nnet3-copy --learning-rate=$lrate $dropout_opt - $dir/$next_x/bottom.raw - rm $models - for lang in $langs; do - models=$dir/$next_x/$lang.*.raw - run.pl $dir/log/average_${lang}.$x.log \ - nnet3-average $models - \| \ - nnet3-am-copy --set-raw-nnet=- --learning-rate=$lrate $dropout_opt $dir/$iter/$lang.mdl $dir/$next_x/$lang.mdl - rm $models - done + if [ -d $dir/$next_x ]; then + echo "$0: removing previous contents of $dir/$next_x" + rm -r $dir/$next_x fi + mkdir -p $dir/$next_x + + for j in $(seq $num_jobs); do + scp_index=${scp_indexes[$j]} + frame_shift=${frame_shifts[$j]} + $cmd $gpu_cmd_opt $dir/log/train.$x.$j.log \ + nnet3-chaina-train --job-id=$j --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --top.unadapted-weight=$top_unadapted_weight --bottom.unadapted-weight=$bottom_unadapted_weight \ + --print-interval=10 --max-param-change=$max_param_change \ + --l2-regularize-factor=$inv_num_jobs --optimization.memory-compression-level=$memory_compression_level \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-copy-egs --frame-shift=$frame_shift scp:$egs_dir/train.$scp_index.scp ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $model_out_dir || touch $dir/$next_x/.error & + done wait - if [ -f $dir/$x/.error_diagnostic ]; then - echo "$0: error detected in diagnostics on iteration $x" + if [ -f $dir/$next_x/.error ]; then + echo "$0: error detected training on iteration $x" exit 1 fi + # First average the bottom models + models=$(for j in $(seq $num_jobs); do echo $dir/$next_x/bottom.$j.raw; done) + run.pl $dir/log/average.$x.log \ + nnet3-average $models - \| \ + nnet3-copy --learning-rate=$lrate $dropout_opt - $dir/$next_x/bottom.raw + rm $models + for lang in $langs; do + models=$dir/$next_x/$lang.*.raw + run.pl $dir/log/average_${lang}.$x.log \ + nnet3-average $models - \| \ + 
nnet3-am-copy --set-raw-nnet=- --learning-rate=$lrate $dropout_opt $dir/$iter/$lang.mdl $dir/$next_x/$lang.mdl + rm $models + done + wait + [ -f $dir/$x/.error_diagnostic ] && echo "$0: error getting diagnostics on iter $x" && exit 1; + + $cmd $dir/log/progress_bottom.$x.log \ + nnet3-show-progress $dir/$x/bottom.raw $dir/$next_x/bottom.raw '&&' \ + nnet3-info $dir/$next_x/bottom.raw || touch $dir/$next_x/.error & + for lang in $langs; do + $cmd $dir/log/progress_${lang}.$x.log \ + nnet3-show-progress $dir/$x/$lang.mdl $dir/$next_x/$lang.mdl '&&' \ + nnet3-am-info $dir/$next_x/$lang.mdl || touch $dir/$next_x/.error & + done + [ -f $dir/$next_x/.error ] && echo "$0: error getting progress logs" && exit 1; - # TODO: diagnostics; cleanup + # TODO: cleanup x=$[x+1] done -if [ $stage -le $num_iters ] && $train; then +if [ $stage -le $num_iters ]; then echo "$0: doing model combination" if [ -d $dir/final ]; then echo "$0: removing previous contents of $dir/final" @@ -234,7 +238,7 @@ if [ $stage -le $num_iters ] && $train; then den_fst_dir=$egs_dir/misc [ $max_models_combine -gt $[num_iters/2] ] && max_models_combine=$[num_iters/2]; - input_model_dirs=$(for x in $(seq $max_models_combine); do echo $dir/$[num_iters+1-x]; done) + input_model_dirs=$(for x in $(seq $[num_iters+1-max_models_combine] $num_iters); do echo $dir/$x; done) output_model_dir=$dir/final transform_dir=$dir/init @@ -249,7 +253,7 @@ if [ $stage -le $num_iters ] && $train; then fi -if [ $stage -le $[num_iters+1] ] && $train; then +if [ $stage -le $[num_iters+1] ]; then # Now accumulate the class-dependent mean (and variance) stats of the # adaptation model, which will be needed for decoding. We remove the map that # had reduced the num-classes from several thousand to (e.g.) 200, because we