/*
* Back propagation through the unfolded LDA model (mirror-descent approach).
*/
// Implemented without atomic operations.
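/// <summary>
/// Backward pass through the mirror-descent unfolding of LDA inference.
/// Reads the forward-pass results in DNNRun and fills Grad.grad_Q_Phi, plus
/// Grad.grad_Q_U for the supervised losses ("softmaxCE", "linearQuad") or
/// Grad.grad_Q_TopPhi for "unsupLDA". Xt holds the mini-batch inputs
/// (nInput x BatchSize, one sample per column); Dt holds the targets and is
/// only read by the supervised output types.
/// </summary>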
public static void BackPropagation_LDA(SparseMatrix Xt, SparseMatrix Dt, DNNRun_t DNNRun, paramModel_t paramModel, Grad_t Grad)
{
// -------- Extract parameters --------
int nHid = paramModel.nHid;
int nHidLayer = paramModel.nHidLayer;
int nOutput = paramModel.nOutput;
float To = paramModel.To;
string OutputType = paramModel.OutputType;
int BatchSize = Xt.nCols;
int nInput = paramModel.nInput;
// -------- Back propagation --------
DenseMatrix grad_Q_po = new DenseMatrix(DNNRun.y);
SparseMatrix TmpSparseMat = new SparseMatrix(Xt);
SparseMatrix grad_Q_po_Sparse = new SparseMatrix(Xt);
DenseMatrix xi = new DenseMatrix(nHid, BatchSize);
DenseMatrix TmpDenseMat = new DenseMatrix(nHid, BatchSize);
DenseMatrix ThetaRatio = new DenseMatrix(nHid, BatchSize);
DenseRowVector TmpDenseRowVec = new DenseRowVector(BatchSize);
DenseMatrix tmp_theta_xi_b_T_OVER_theta_lm1_2 = new DenseMatrix(nHid, BatchSize);
SparseMatrix tmp_Xt_OVER_Phitheta = new SparseMatrix(Xt);
SparseMatrix tmp_Phi_theta_xi = new SparseMatrix(Xt);
Grad.grad_Q_Phi.ClearValue();
// ---- Offset of effective number of layers ----
int[] OffsetEffNumLayer = new int[BatchSize];
OffsetEffNumLayer[0] = 0;
int NumTotalLayer = DNNRun.nHidLayerEffective[0];
for (int IdxSample = 1; IdxSample < BatchSize; ++IdxSample)
{
OffsetEffNumLayer[IdxSample] = OffsetEffNumLayer[IdxSample - 1] + DNNRun.nHidLayerEffective[IdxSample - 1];
NumTotalLayer += DNNRun.nHidLayerEffective[IdxSample];
}
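// OffsetEffNumLayer[i] is the column offset of sample i's first layer inside the
// pooled per-layer buffers allocated below; NumTotalLayer is the sum of effective
// depths over the mini-batch (layers are unfolded per sample, so depths may differ).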
// ---- Temporary variables that store the intermediate results used to compute the gradients ----
DenseMatrix tmp_theta_xi_pool = new DenseMatrix(nHid, NumTotalLayer, 0.0f);
DenseMatrix tmp_theta_xi = new DenseMatrix(nHid, BatchSize, 0.0f);
DenseMatrix theta_l_minus_one = new DenseMatrix(nHid, NumTotalLayer, 0.0f);
SparseMatrix tmp_Xt_OVER_Phitheta_pool = new SparseMatrix(nInput, NumTotalLayer);
SparseMatrix TmpSparseMat_pool = new SparseMatrix(nInput, NumTotalLayer);
for (int IdxSample = 0; IdxSample < BatchSize; ++IdxSample)
{
    int Layer_begin = OffsetEffNumLayer[IdxSample];
    int Layer_end = Layer_begin + DNNRun.nHidLayerEffective[IdxSample];
    SparseColumnVector xt = Xt.SparseColumnVectors[IdxSample];
    // Give every effective layer of this sample its own copy of the sample's sparsity pattern.
    for (int IdxLayer = Layer_begin; IdxLayer < Layer_end; ++IdxLayer)
    {
        tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxLayer] = new SparseColumnVector(xt);
        TmpSparseMat_pool.SparseColumnVectors[IdxLayer] = new SparseColumnVector(xt);
    }
}
int[] SparsePatternGradPhi = Xt.GetHorizontalUnionSparsePattern();
SparseMatrix TmpGrad = new SparseMatrix(nInput, nHid, true);
TmpGrad.SetSparsePatternForAllColumn(SparsePatternGradPhi);
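// The gradient wrt Phi can only have mass on words that appear somewhere in the
// batch, so TmpGrad is restricted to the union of the samples' sparsity patterns.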
// ---- Compute grad Q wrt po if possible ----
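// grad_Q_po was initialized above as a copy of the network output DNNRun.y.
// "softmaxCE" forms To * (y - Dt), consistent with a To-scaled softmax plus
// cross-entropy loss (reading To as a scale/temperature is an assumption);
// "linearQuad" forms 2 * (y - Dt), the gradient of a squared loss. "unsupLDA"
// needs Phi * theta_top, so its gradient is computed per sample in the
// parallel loop below.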
switch (OutputType)
{
case "softmaxCE":
MatrixOperation.MatrixSubtractMatrix(grad_Q_po, Dt);
MatrixOperation.ScalarMultiplyMatrix(grad_Q_po, To);
Grad.grad_Q_U.ClearValue();
break;
case "linearQuad":
MatrixOperation.MatrixSubtractMatrix(grad_Q_po, Dt);
MatrixOperation.ScalarMultiplyMatrix(grad_Q_po, 2.0f);
Grad.grad_Q_U.ClearValue();
break;
case "unsupLDA":
Grad.grad_Q_TopPhi.SetAllValuesToZero();
break;
case "linearCE":
throw new Exception("linearCE is not implemented.");
default:
throw new Exception("Unknown OutputType");
}
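// Per-sample back propagation. Each iteration writes only to column IdxSample of
// the shared buffers (or to the sample's own slice of the pooled buffers), so the
// loop needs no locks or atomics -- hence "without atomic operations" above.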
Parallel.For(0, BatchSize, new ParallelOptions { MaxDegreeOfParallelism = MatrixOperation.MaxMultiThreadDegree }, IdxSample =>
{
// ***************************************************************************
// -------- Back propagation: top layer --------
switch (OutputType)
{
case "softmaxCE":
// ---- grad Q wrt pL (x_L) ----
MatrixOperation.MatrixTransposeMultiplyVector(
xi.DenseMatrixValue[IdxSample],
paramModel.U,
grad_Q_po.DenseMatrixValue[IdxSample]
);
MatrixOperation.ElementwiseVectorMultiplyVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample]
);
TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum();
MatrixOperation.ScalarAddVector(
xi.DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample],
TmpDenseRowVec.VectorValue[IdxSample] * (-1.0f)
);
break;
case "linearQuad":
// ---- grad Q wrt pL (x_L) ----
MatrixOperation.MatrixTransposeMultiplyVector(
xi.DenseMatrixValue[IdxSample],
paramModel.U,
grad_Q_po.DenseMatrixValue[IdxSample]
);
MatrixOperation.ElementwiseVectorMultiplyVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample]
);
TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum();
MatrixOperation.ScalarAddVector(
xi.DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample],
(-1.0f) * TmpDenseRowVec.VectorValue[IdxSample]
);
break;
case "unsupLDA":
// ---- grad Q wrt po ----
MatrixOperation.MatrixMultiplyVector(
grad_Q_po_Sparse.SparseColumnVectors[IdxSample],
paramModel.Phi,
DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]
);
MatrixOperation.ElementwiseVectorDivideVector(
grad_Q_po_Sparse.SparseColumnVectors[IdxSample],
Xt.SparseColumnVectors[IdxSample],
grad_Q_po_Sparse.SparseColumnVectors[IdxSample]
);
// ---- grad Q wrt pL (x_L) ----
MatrixOperation.MatrixTransposeMultiplyVector(
xi.DenseMatrixValue[IdxSample],
paramModel.Phi,
grad_Q_po_Sparse.SparseColumnVectors[IdxSample]
);
MatrixOperation.ScalarMultiplyVector(
xi.DenseMatrixValue[IdxSample],
-1.0f
);
MatrixOperation.ElementwiseVectorMultiplyVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]
);
TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum();
MatrixOperation.ScalarAddVector(
xi.DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample],
(-1.0f) * TmpDenseRowVec.VectorValue[IdxSample]
);
break;
case "linearCE":
throw new Exception("linearCE is not implemented.");
//break;
default:
throw new Exception("Unknown OutputType");
}
// ***************************************************************************
// -------- Back propagation: hidden layers --------
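// Assumed forward recursion (inferred from the quantities used below): each
// unfolded layer performs one mirror-descent step of LDA inference, roughly
// theta_l ~ theta_{l-1} .* exp(T_l * g_l) (normalized), where g_l combines
// Phi^T( x ./ (Phi*theta_{l-1}) ) and the Dirichlet term b ./ theta_{l-1}.
// The loop below walks the layers top-down, storing each layer's contribution
// to dQ/dPhi in the pooled buffers and propagating xi_l to xi_{l-1}.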
for (int IdxLayer = DNNRun.nHidLayerEffective[IdxSample] - 1; IdxLayer >= 0; IdxLayer--)
{
// ---- Compute the position in the temporary variable for the current layer at the current sample ----
int IdxTmpVar = OffsetEffNumLayer[IdxSample] + IdxLayer;
// ---- grad wrt b ---
// Not implemented at the moment. (Can be used to update the Dirichlet parameter automatically.)
// ---- Compute the intermediate variables ----
MatrixOperation.ElementwiseVectorMultiplyVector(
tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar],
DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample]
);
// Divide theta_l .* xi by theta_{l-1} twice (the factors b and T_l are applied just below)
if (IdxLayer == 0)
{
    MatrixOperation.ElementwiseVectorDivideVector(
        tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
        tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar],
        DNNRun.theta0.DenseMatrixValue[IdxSample]
        );
    MatrixOperation.ElementwiseVectorDivideVector(
        tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
        tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
        DNNRun.theta0.DenseMatrixValue[IdxSample]
        );
}
else
{
    MatrixOperation.ElementwiseVectorDivideVector(
        tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
        tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar],
        DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
        );
    MatrixOperation.ElementwiseVectorDivideVector(
        tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
        tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
        DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
        );
}
MatrixOperation.ElementwiseVectorMultiplyVector(
tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
paramModel.b
);
MatrixOperation.ScalarMultiplyVector(
tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]
);
// Reset the elements to zero if theta_{l-1} is zero at these positions (mainly for alpha<1 case)
if (IdxLayer > 0)
{
MatrixOperation.ResetVectorSparsePattern(
tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
);
}
// Continue computing the intermediate variables
if (IdxLayer == 0) // TmpSparseMat is Phitheta_lm1
{
MatrixOperation.MatrixMultiplyVector(
TmpSparseMat.SparseColumnVectors[IdxSample],
paramModel.Phi,
DNNRun.theta0.DenseMatrixValue[IdxSample]
);
}
else
{
MatrixOperation.MatrixMultiplyVector(
TmpSparseMat.SparseColumnVectors[IdxSample],
paramModel.Phi,
DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
);
}
MatrixOperation.ElementwiseVectorDivideVector(
tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar],
Xt.SparseColumnVectors[IdxSample],
TmpSparseMat.SparseColumnVectors[IdxSample]
);
MatrixOperation.ElementwiseVectorDivideVector(
TmpSparseMat.SparseColumnVectors[IdxSample],
tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar],
TmpSparseMat.SparseColumnVectors[IdxSample]
); // TmpSparseMat is tmp_Xt_OVER_Phitheta2
MatrixOperation.MatrixMultiplyVector(
tmp_Phi_theta_xi.SparseColumnVectors[IdxSample],
paramModel.Phi,
tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar]
);
MatrixOperation.ElementwiseVectorMultiplyVector(
TmpSparseMat.SparseColumnVectors[IdxSample],
tmp_Phi_theta_xi.SparseColumnVectors[IdxSample]
); // TmpSparseMat is ( tmp_Phi_theta_xi.*tmp_Xt_OVER_Phitheta2 )
MatrixOperation.MatrixTransposeMultiplyVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
paramModel.Phi,
TmpSparseMat.SparseColumnVectors[IdxSample]
);
MatrixOperation.ScalarMultiplyVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]
); // TmpDenseMat is tmp_Tl_Phit_xtPhiTheta2_Phi_theta_xi
// ---- Compute the gradient wrt Phi ----
MatrixOperation.ScalarMultiplyVector(
tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar],
DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]
);
MatrixOperation.ScalarMultiplyVector(
TmpSparseMat_pool.SparseColumnVectors[IdxTmpVar],
TmpSparseMat.SparseColumnVectors[IdxSample],
DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]*(-1.0f)
);
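// Record theta_{l-1} for the final gradient reduction (reference assignment, no
// copy; the vector is only read afterwards).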
if (IdxLayer == 0)
{
theta_l_minus_one.DenseMatrixValue[IdxTmpVar] = DNNRun.theta0.DenseMatrixValue[IdxSample];
}
else
{
theta_l_minus_one.DenseMatrixValue[IdxTmpVar] = DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample];
}
// ---- Compute xi_{l-1} via back propagation ----
if (IdxLayer > 0)
{
// Reset the elements to zero if theta_{l-1} is zero at these positions (mainly for alpha<1 case)
MatrixOperation.ElementwiseVectorDivideVector(
ThetaRatio.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample],
DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
);
MatrixOperation.ResetVectorSparsePattern(
ThetaRatio.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
);
MatrixOperation.ElementwiseVectorMultiplyVector(
xi.DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample],
ThetaRatio.DenseMatrixValue[IdxSample]
);
// Compute xi_{l-1} now
MatrixOperation.VectorSubtractVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
xi.DenseMatrixValue[IdxSample],
TmpDenseMat.DenseMatrixValue[IdxSample]
);
MatrixOperation.VectorSubtractVector(
TmpDenseMat.DenseMatrixValue[IdxSample],
TmpDenseMat.DenseMatrixValue[IdxSample],
tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample]
);
MatrixOperation.ElementwiseVectorMultiplyVector(
tmp_theta_xi.DenseMatrixValue[IdxSample],
DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample],
TmpDenseMat.DenseMatrixValue[IdxSample]
); // tmp_theta_xi is "tmp1" in the MATLAB code
TmpDenseRowVec.VectorValue[IdxSample] = tmp_theta_xi.DenseMatrixValue[IdxSample].Sum();
MatrixOperation.ScalarAddVector(
xi.DenseMatrixValue[IdxSample],
TmpDenseMat.DenseMatrixValue[IdxSample],
TmpDenseRowVec.VectorValue[IdxSample] * (-1.0f)
);
}
}
});
// -------- Compute the gradients --------
// ---- Gradient with respect to U ----
DenseMatrix Theta_Top = new DenseMatrix(nHid, BatchSize);
for (int IdxSample = 0; IdxSample < BatchSize; ++IdxSample)
{
Theta_Top.DenseMatrixValue[IdxSample] = DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample];
}
switch (OutputType)
{
case "softmaxCE":
// ---- grad Q wrt U ----
MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_U, grad_Q_po, Theta_Top);
MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (1.0f / (float)BatchSize));
break;
case "linearQuad":
// ---- grad Q wrt U ----
MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_U, grad_Q_po, Theta_Top);
MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (1.0f / (float)BatchSize));
break;
case "unsupLDA":
// ---- grad Q wrt Phi on top ----
MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_TopPhi, grad_Q_po_Sparse, Theta_Top, false);
MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_TopPhi, Grad.grad_Q_TopPhi, (-1.0f / (float)BatchSize));
break;
case "linearCE":
throw new Exception("linearCE is not implemented.");
//break;
default:
throw new Exception("Unknown OutputType");
}
// ---- Gradient with respect to Phi ----
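// Reduce the pooled per-layer pieces with two accumulating matrix products (the
// trailing 'true' is taken to mean "accumulate into TmpGrad", an assumption based
// on how the flag is used here); this reduction is what replaces per-sample
// atomic updates to Grad.grad_Q_Phi.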
TmpGrad.SetAllValuesToZero();
MatrixOperation.MatrixMultiplyMatrixTranspose(TmpGrad, tmp_Xt_OVER_Phitheta_pool, tmp_theta_xi_pool, true);
MatrixOperation.MatrixMultiplyMatrixTranspose(TmpGrad, TmpSparseMat_pool, theta_l_minus_one, true);
MatrixOperation.ScalarMultiplyMatrix(TmpGrad, TmpGrad, (1.0f / (float)BatchSize));
MatrixOperation.MatrixAddMatrix(Grad.grad_Q_Phi, TmpGrad);
}
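
/*
 * Illustrative usage sketch (not from the original source): one gradient step on
 * a mini-batch. ForwardActivation_LDA and UpdateParameters are hypothetical names
 * standing in for the repo's forward pass and optimizer; only BackPropagation_LDA
 * above is real.
 *
 *   SparseMatrix Xt = ...;  // mini-batch inputs (nInput x BatchSize)
 *   SparseMatrix Dt = ...;  // targets; ignored when OutputType == "unsupLDA"
 *   ForwardActivation_LDA(Xt, DNNRun, paramModel);          // hypothetical: fills DNNRun
 *   BackPropagation_LDA(Xt, Dt, DNNRun, paramModel, Grad);  // fills Grad.grad_Q_Phi (+ U or TopPhi)
 *   UpdateParameters(paramModel, Grad, LearningRate);       // hypothetical optimizer step
 */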