Handwritten Digit Recognition with a Neural Network in C++ (Part 3: Implementing ex4)

Contents

Implementing the basic architecture class

Part 1: Loading and Visualizing Data

Part 2: Loading Parameters

Part 3: Compute Cost (Feedforward)

Part 4: Implement Regularization

Part 5: Sigmoid Gradient

Part 6: Initializing Parameters

Part 7: Implement Backpropagation

Part 8: Training NN

Complete code

Implementing the basic architecture class

#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>
#include <iostream>  
#include <fstream>
#include <ostream>
#include <typeinfo>
#include <time.h>
using namespace std;
using namespace cv; 
class Nnetwork{
public:
	int visibleSize, hiddenSize, outputSize, layer_num;	//layer sizes and the number of weight layers
	double lambda, cost, a;	//regularization parameter, current cost, gradient-descent step size
	Mat data,	//training input (one sample per row)
	pre_data,	//prediction-set input
	pre_outp,	//prediction-set labels
	outp,	//training labels
	*b, *W, *bgrad, *Wgrad, *active_value, *test_Wgrad, *test_bgrad, *av;	//per-layer biases, weights, gradients and activations
	void initParam();
	Nnetwork();
	Nnetwork(int visiblesize, int hiddensize, int outpsize, int layernums, double lambda) :
		visibleSize(visiblesize),
		hiddenSize(hiddensize),
		outputSize(outpsize),
		layer_num(layernums),
		lambda(lambda)
	{
		initParam();
	}
	Mat sigmoid(Mat matrix);
	double sigmoid(double num);
	Mat mat_exp(Mat r);
	Mat mat_log(Mat r);
	void forward_propagation();
	void showimage(Mat data, int pic_size, int num);
	void test_readdata();
	void test_readlabel();
	void test_load_Param();
	void test_nncost_1();
	void test_nncost_2();
	double test_nncost_3(double lambda, Mat *active_value, Mat *b, Mat *W);
	Mat sigmoidGradient(Mat inp);
	void writeMatToFile(cv::Mat& m, const char* filename);
	void computeNumericalGradient();
	Mat debugInitializeWeights(int fan_out, int fan_in);
	void checkNNGradients();
	void train();
	double predict();
	double pre_dict();
	void before_train();
};
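
The constructor arguments are, in order: the input (visible) layer size, the hidden layer size, the output layer size, the number of weight layers, and the regularization parameter lambda. For the ex4 network (400-pixel inputs, 25 hidden units, 10 classes), main() in the complete listing below constructs it as:

	//400 input pixels, 25 hidden units, 10 output classes, 2 weight layers, lambda = 3
	Nnetwork tt(400, 25, 10, 2, 3);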

Part 1: Loading and Visualizing Data 

Part 2: Loading Parameters

See the previous post: https://blog.****.net/Runner_of_nku/article/details/88815894

Part 3: Compute Cost (Feedforward) 

Part 4: Implement Regularization

The forward propagation in this step simply reuses the parameters loaded in the previous part; what we still have to implement ourselves is the regularized cost function and the back-propagated gradients.
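
For reference, the cost computed in test_nncost_1 below is the standard ex4 cost. With m = data_size, a3 = active_value[1] (the output activations), and the separate bias vectors b left out of the regularization term:

J = (1/m) * Σ_i Σ_k [ -yk·log(a3) - (1 - yk)·log(1 - a3) ] + λ/(2m) · ( ΣW[0]² + ΣW[1]² )

The code: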

	void test_nncost_1()
	{
		delete[]active_value;
		active_value = new Mat[2];
		int data_size = outp.rows;
		active_value[0] = repeat(b[0], 1, data_size);
		active_value[1] = repeat(b[1], 1, data_size);
		active_value[0] = sigmoid(W[0]*data.t()+active_value[0]);
		active_value[1] = sigmoid(W[1]*active_value[0] + active_value[1]);
		Mat yk = Mat::zeros(10, data_size, CV_64FC1);
		for (int i = 0; i < data_size; i++)
			yk.at<double>(int(outp.at<double>(i, 0))-1,i) = 1;

		double J = sum((-1 * yk).mul(mat_log(active_value[1])) - (1 - yk).mul(mat_log(1 - active_value[1])))[0]/data_size;

		cout << "Cost at parameters (loaded from ex4weights)\n(this value should be about 0.287629)\n" << J<<endl;

		lambda = 1;	//ex4 uses lambda = 1 for the regularized cost
		J += lambda / 2 / data_size * (sum(W[0].mul(W[0]))[0] + sum(W[1].mul(W[1]))[0]);
		cout << "Cost at parameters (loaded from ex4weights)\n(this value should be about 0.383770)\n" << J<<endl;
		cost = J;
		Mat delta3 = (active_value[1] - yk);	//output-layer error: a3 - y
		Mat tem = (delta3.t()*W[1]).t();	//propagate the error back through W[1]
		Mat delta2 = tem.mul(active_value[0]).mul(1 - active_value[0]);	//hidden-layer error (sigmoid gradient)

		Wgrad[1] = delta3*active_value[0].t() / data_size + lambda*W[1] / data_size;
		Wgrad[0] = delta2*data / data_size + lambda*W[0] / data_size;
		bgrad[1] = Mat(delta3.rows, 1, CV_64FC1, Scalar::all(0));
		bgrad[0] = Mat(delta2.rows, 1, CV_64FC1, Scalar::all(0));
		//reduce with rtype 1 (CV_REDUCE_AVG) averages the deltas over the columns,
		//i.e. sums over all examples and divides by data_size
		reduce(delta3, bgrad[1], 1, 1);
		reduce(delta2, bgrad[0], 1, 1);

	}


Part 5: Sigmoid Gradient

This part is straightforward: implement the derivative of the sigmoid function. In the actual computation we can simply write sigmoid(x) * (1 - sigmoid(x)).
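
In this class it becomes a one-line member function, using the existing sigmoid(Mat) overload and OpenCV's element-wise mul (it also appears in the complete listing at the end):

	Mat sigmoidGradient(Mat inp)
	{
		//g'(z) = g(z) .* (1 - g(z)), element-wise
		return sigmoid(inp).mul(1 - sigmoid(inp));
	}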

Part 6: Initializing Parameters

In ex4 the random-initialization range 0.12 is simply given as a constant; where does that 0.12 actually come from?

sqrt(6) / sqrt(hiddenSize + visibleSize + 1) ≈ 0.12
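
For the ex4 sizes (visibleSize = 400, hiddenSize = 25) this gives sqrt(6) / sqrt(25 + 400 + 1) = 2.449 / 20.64 ≈ 0.119, which is where the 0.12 in the exercise comes from. initParam() below uses the formula directly instead of hard-coding 0.12: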

	void initParam()
	{
		a = 0.2;
		active_value = NULL; av = NULL;	//so the delete[] calls elsewhere are safe before the first allocation
		b = new Mat[layer_num];
		W = new Mat[layer_num];
		b[0] = Mat(hiddenSize, 1, CV_64FC1, Scalar(0));
		b[layer_num - 1] = Mat(outputSize, 1, CV_64FC1, Scalar(0));
		W[0] = Mat(hiddenSize, visibleSize, CV_64FC1);
		W[layer_num - 1] = Mat(outputSize, hiddenSize, CV_64FC1);
		for (int i = 1; i < layer_num - 1; i++)
		{
			W[i] = Mat(hiddenSize, hiddenSize, CV_64FC1);
			b[i] = Mat(hiddenSize, 1, CV_64FC1, Scalar(0));
		}
		double r = sqrt(6) / sqrt(hiddenSize + visibleSize + 1);
		for (int i = 0; i < layer_num; i++)
		{
			//draw uniformly in [0, 1), then map to the symmetric range [-r, r]
			randu(W[i], Scalar::all(0), Scalar::all(1));
			randu(b[i], Scalar::all(0), Scalar::all(1));
			W[i] = W[i] * 2 * r - r;
			b[i] = b[i] * 2 * r - r;
		}

		bgrad = new Mat[layer_num]; Wgrad = new Mat[layer_num];
		for (int i = 0; i < layer_num; i++)
		{
			bgrad[i] = Mat::zeros(b[i].size(),CV_64FC1);
			Wgrad[i] = Mat::zeros(W[i].size(), CV_64FC1);
		}
		cost = 0;
	}

Part 7: Implement Backpropagation

Implement backpropagation; in this step we implement it together with gradient checking.

We define a tiny 3-5-3 neural network whose inputs are generated with the sin function and whose labels run from 1 to 3; this tiny network lets us debug our own cost function and gradients.
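
The numerical check in computeNumericalGradient perturbs one parameter at a time and uses the centered difference with e = 0.0001:

numerical gradient ≈ ( J(θ + e) − J(θ − e) ) / (2e)

The relative difference printed at the end, sqrt( Σ(numerical − analytic)² / Σ(numerical + analytic)² ), should come out very small (ex4 expects something on the order of 1e-9 or less) if the back-propagated gradients are correct.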

	void checkNNGradients()
	{
		int input_layer_size = 3,hidden_layer_size = 5,num_labels = 3,m = 5;
		Mat Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size),//size 5*4
			Theta2 = debugInitializeWeights(num_labels, hidden_layer_size),//3*6
			X = debugInitializeWeights(m, input_layer_size - 1),//5*3
			y = Mat(m, 1, CV_64FC1);
		for (int i = 1; i <= m; i++)
		{
			y.at<double>(i - 1, 0) = i%num_labels+1;
		}
		//cout << y;
		Nnetwork checknn(input_layer_size, hidden_layer_size, num_labels, 2,lambda);
		checknn.data = X;
		checknn.outp = y;
		checknn.W[0] = Mat::zeros(hidden_layer_size, input_layer_size, CV_64FC1);//5*3
		checknn.W[1] = Mat(num_labels, hidden_layer_size, CV_64FC1);//3*5
		checknn.b[0] = Mat(hidden_layer_size, 1, CV_64FC1);//5*1
		checknn.b[1] = Mat(num_labels, 1, CV_64FC1);//3*1
		for (int i = 0; i < Theta1.rows; i++)
			for (int j = 1; j < Theta1.cols; j++)
				checknn.W[0].at<double>(i, j-1) = Theta1.at<double>(i,j);
		for (int j = 0; j < Theta1.rows; j++)
			checknn.b[0].at<double>(j, 0) = Theta1.at<double>(j, 0);
		for (int i = 0; i < Theta2.rows; i++)
			for (int j = 1; j < Theta2.cols; j++)
				checknn.W[1].at<double>(i, j - 1) = Theta2.at<double>(i, j);
		for (int j = 0; j < Theta2.rows; j++)
			checknn.b[1].at<double>(j, 0) = Theta2.at<double>(j, 0);	//note: Theta2, not Theta1



		checknn.test_nncost_2();
		checknn.computeNumericalGradient();
		
	}
	void computeNumericalGradient()
	{
		double e = 0.0001; int count = 0;	//use the member lambda so the regularized gradients are checked too
		Mat *WW, *bb, *ac;
		ac = new Mat[layer_num];
		WW = new Mat[layer_num];
		bb = new Mat[layer_num];
		test_bgrad = new Mat[layer_num];
		test_Wgrad = new Mat[layer_num];
		for (int i = 0; i < layer_num; i++)
		{
			WW[i] = W[i].clone();
			bb[i] = b[i].clone();
		}
		double diff1 = 0, diff2 = 0;
		for (int i = 0; i < layer_num; i++)
		{
			test_bgrad[i] = Mat(bgrad[i].size(), CV_64FC1);
			for (int j = 0; j < bb[i].rows; j++)
			{
				for (int k = 0; k < bb[i].cols; k++)
				{
					bb[i].at<double>(j, k) += e;
					double loss1 = test_nncost_3(lambda, ac, bb, WW);
					bb[i].at<double>(j, k) -= 2*e;
					double loss2 = test_nncost_3(lambda, ac, bb, WW);
					bb[i].at<double>(j, k) += e;
					test_bgrad[i].at<double>(j, k) = (loss1 - loss2) / 2 / e;
				}
			}
			//accumulate the squared differences over all layers and parameters
			diff1 += sum((test_bgrad[i] - bgrad[i]).mul(test_bgrad[i] - bgrad[i]))[0];
			diff2 += sum((test_bgrad[i] + bgrad[i]).mul(test_bgrad[i] + bgrad[i]))[0];
		}
		for (int i = 0; i < layer_num; i++)
		{
			test_Wgrad[i] = Mat(Wgrad[i].size(), CV_64FC1);
			for (int j = 0; j < WW[i].rows; j++)
			{
				for (int k = 0; k < WW[i].cols; k++)
				{
					WW[i].at<double>(j, k) += e;
					double loss1 = test_nncost_3(lambda, ac, bb, WW);
					WW[i].at<double>(j, k) -= 2 * e;
					double loss2 = test_nncost_3(lambda, ac, bb, WW);
					WW[i].at<double>(j, k) += e;
					test_Wgrad[i].at<double>(j, k) = (loss1 - loss2) / 2 / e;
				}
			}
			diff1 += sum((test_Wgrad[i] - Wgrad[i]).mul(test_Wgrad[i] - Wgrad[i]))[0];
			diff2 += sum((test_Wgrad[i] + Wgrad[i]).mul(test_Wgrad[i] + Wgrad[i]))[0];
		}
		cout << "diff is : " << sqrt(diff1 / diff2)<<endl;

	}
	void test_nncost_2()
	{
		active_value = new Mat[2];
		int data_size = outp.rows;
		active_value[0] = repeat(b[0], 1, data_size);
		active_value[1] = repeat(b[1], 1, data_size);
		cout << "sizeof av0 : " << active_value[0].size() << endl;
		cout << "sizeof av1 : " << active_value[1].size() << endl;
		cout << "sizeof data : " << data.size() << endl;
		cout << "sizeof W[0] : " << W[0].size() << endl;
		cout << "sizeof W[1] : " << W[1].size() << endl;
		active_value[0] = sigmoid(W[0] * data.t() + active_value[0]);
		active_value[1] = sigmoid(W[1] * active_value[0] + active_value[1]);
		
		Mat yk = Mat::zeros(3, data_size, CV_64FC1);
		for (int i = 0; i < data_size; i++)
				yk.at<double>(int(outp.at<double>(i, 0)) - 1, i) = 1;
		//cout << mat_log(active_value[1]) << endl;
		double J = sum((-1 * yk).mul(mat_log(active_value[1])) - (1 - yk).mul(mat_log(1 - active_value[1])))[0] / data_size;

		cout << "Cost at parameters \n(this value should be about 2.10095)\n" << J << endl;
		cost = J;
		J += lambda / 2 / data_size * (sum(W[0].mul(W[0]))[0] + sum(W[1].mul(W[1]))[0]);
		cout << "Cost at parameters \n(this value should be about )\n" << 0.0151364*lambda<< J - cost << endl;
		cost = J;

		Mat delta3 = (active_value[1] - yk);
		Mat tem = (delta3.t()*W[1]).t();
		Mat delta2 = tem.mul(active_value[0]).mul(1 - active_value[0]);

		Wgrad[1] = delta3*active_value[0].t() / data_size +lambda*W[1]/data_size;
		Wgrad[0] = delta2*data / data_size + lambda*W[0]/data_size;
		bgrad[1] = Mat(delta3.rows, 1, CV_64FC1, Scalar::all(0));
		bgrad[0] = Mat(delta2.rows, 1, CV_64FC1, Scalar::all(0));
		reduce(delta3, bgrad[1], 1, 1);
		reduce(delta2, bgrad[0], 1, 1);
		//cout << Wgrad[0] << endl;
	}

Part 8: Training NN

For training we don't have a powerful optimizer like MATLAB's fmincg, so we have to iterate gradient descent by hand. My suggestion is to start with a step size a of 0.2 and drop it to 0.1 once the cost falls below 1; this is a bit faster. If you use 0.1 from the start, it takes a very long time to converge.
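
The update itself is plain batch gradient descent; the relevant lines of train() from the complete listing below are:

	a = cost < 1 ? 0.1 : 0.2;	//step size: 0.2 at first, 0.1 once the cost drops below 1
	test_nncost_1();	//forward pass + cost + gradients (Wgrad, bgrad)
	for (int i = 0; i < layer_num; i++)
	{
		W[i] = W[i] - Wgrad[i] * a;
		b[i] = b[i] - bgrad[i] * a;
	}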

Here we use 4000 samples as the training set and 1000 samples as the prediction (test) set.

The results after 2500 iterations are shown below:

(Screenshots of the training output: the cost and prediction accuracy printed over the iterations.)

Complete code:

#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>
#include <iostream>  
#include <fstream>
#include <ostream>
#include <typeinfo>
#include <time.h>
using namespace std;
using namespace cv;
class Nnetwork{
public:
	int visibleSize, hiddenSize, outputSize, layer_num;
	double lambda, cost, a;
	Mat data;//input
	Mat pre_data;
	Mat pre_outp;
	Mat outp;//output
	Mat *b, *W, *bgrad, *Wgrad, *active_value, *test_Wgrad, *test_bgrad,*av;
	void initParam()
	{
		a = 0.2;
		active_value = NULL; av = NULL;	//so the delete[] calls elsewhere are safe before the first allocation
		b = new Mat[layer_num];
		W = new Mat[layer_num];
		b[0] = Mat(hiddenSize, 1, CV_64FC1, Scalar(0));
		b[layer_num - 1] = Mat(outputSize, 1, CV_64FC1, Scalar(0));
		W[0] = Mat(hiddenSize, visibleSize, CV_64FC1);
		W[layer_num - 1] = Mat(outputSize, hiddenSize, CV_64FC1);
		for (int i = 1; i < layer_num - 1; i++)
		{
			W[i] = Mat(hiddenSize, hiddenSize, CV_64FC1);
			b[i] = Mat(hiddenSize, 1, CV_64FC1, Scalar(0));
		}
		double r = sqrt(6) / sqrt(hiddenSize + visibleSize + 1);
		for (int i = 0; i < layer_num; i++)
		{
			randu(W[i], Scalar::all(0), Scalar::all(1));
			randu(b[i], Scalar::all(0), Scalar::all(1));
			W[i] = W[i] * 2 * r - r;
			b[i] = b[i] * 2 * r - r;
		}

		bgrad = new Mat[layer_num]; Wgrad = new Mat[layer_num];
		for (int i = 0; i < layer_num; i++)
		{
			bgrad[i] = Mat::zeros(b[i].size(),CV_64FC1);
			Wgrad[i] = Mat::zeros(W[i].size(), CV_64FC1);
		}
		cost = 0;
	}
	Nnetwork()
	{
		//the default constructor builds a network with a single hidden layer
		layer_num = 2;//total number of weight layers; 2 when there is a single hidden layer
		visibleSize = 3;
		hiddenSize = 3;
		outputSize = 1;
		lambda = 0.0001;
		initParam();
	}
	Nnetwork(int visiblesize, int hiddensize, int outpsize, int layernums, double lambda) :
		visibleSize(visiblesize),
		hiddenSize(hiddensize),
		outputSize(outpsize),
		layer_num(layernums),
		lambda(lambda)
	{
		initParam();
	}
	Mat sigmoid(Mat matrix)
	{
		return 1.0 / (1.0 + mat_exp(-1 * matrix));
	}
	double sigmoid(double num)
	{
		return 1.0 / (1.0 + exp(-1 * num));
	}
	Mat mat_exp(Mat r)
	{
		Mat tmp = Mat(r.size(), CV_64FC1);
		for (int i = 0; i < r.rows; i++)
		{
			for (int j = 0; j < r.cols; j++)
			{
				tmp.at<double>(i, j) = exp(r.at<double>(i, j));
			}
		}
		return tmp;
	}
	Mat mat_log(Mat r)
	{
		Mat tmp = Mat(r.size(),CV_64FC1);
		for (int i = 0; i < r.rows; i++)
		{
			for (int j = 0; j < r.cols; j++)
			{
				tmp.at<double>(i, j) = log(r.at<double>(i, j));
			}
		}
		return tmp;
	}
	void forward_propagation()
	{
		int data_size = data.cols;
		delete[] active_value;
		active_value = new Mat[layer_num];
		for (int i = 0; i < layer_num; i++)
			active_value[i] = repeat(b[i], 1, data_size);
		active_value[0] = sigmoid(W[0] * data + active_value[0]);
		for (int i = 1; i < layer_num; i++)
			active_value[i] = sigmoid(W[i] * active_value[i - 1] + active_value[i]);	
	}
	void showimage(Mat data,int pic_size,int num)
	{

		if (data.empty()) return;
		Mat image = Mat::ones(num*pic_size, num*pic_size, CV_64FC1);
		int res = 0;
		for (int i = 0; i < num; i++)
		{
			for (int j = 0; j < num; j++)
			{
				int c = 0;
				for (int r = pic_size * i; r<pic_size * i + pic_size; r++)
				{
					for (int l = j * pic_size; l<pic_size * j + pic_size; l++)
					{
						image.at<double>(r, l) = data.at<double>(i * num + j, c++);
					}
				}
			}
		}
		imshow("image", image.t());
		waitKey();
	}
	void test_readdata()
	{
		const char* path = "C:\\Users\\ACER\\Documents\\MATLAB\\dataset";
		ifstream is(path, ios::in | ios::binary);

		data = Mat(5000, 20 * 20, CV_64FC1);

		if (!is) {
			cout << "cannot open file: " << path << endl;
		}
		else {
			if (is) {
				double temp = 0;
				for (int i = 0; i <5000; ++i) {
					Mat image = Mat::zeros(20, 20, CV_64FC1);

					for (int r = 0; r < 20; ++r) {
						for (int c = 0; c < 20; ++c) {
							is.read((char*)&temp, sizeof(temp));
							data.at<double>(i, r * 20 + c) = (double)temp;
						}
					}
					cout << "read data : " << i << endl;
				}
			}
		}
		is.close();
		//showimage(data, 20,70);
	}
	void test_readlabel()
	{
		int test[11];
		memset(test, 0, sizeof(test));
		const char* path = "C:\\Users\\ACER\\Desktop\\train-labels-idx1-ubyte\\test";
		ifstream is(path, ios::in | ios::binary);

		vector<double> labels;
		outp = Mat(5000, 1, CV_64FC1);
		if (!is) {
			cout << "cannot open file: " << path << endl;
		}
		else {
			if (is) {
				cout << "read successful " << endl;
				unsigned char temp = 0;
				int size = sizeof(temp);
				for (int i = 0; i < 5000; ++i) {
					is.read((char*)&temp, size);
					labels.push_back((double)temp);
					outp.at<double>(i, 0) = labels[i];
					test[int(labels[i])]++;
				}
			}
		}
		is.close();

		for (int i = 1; i < 11; i++)
		{
			cout << i << " : " << test[i] << endl;
		}
	}
	void test_load_Param()
	{
		const char* path = "C:\\Users\\ACER\\Documents\\MATLAB\\param";
		ifstream is(path, ios::in | ios::binary);

		b = new Mat[2];
		W = new Mat[2];
		W[0] = Mat(25, 400, CV_64FC1);
		W[1] = Mat(10, 25, CV_64FC1);
		b[0] = Mat(25, 1, CV_64FC1);
		b[1] = Mat(10, 1, CV_64FC1);
		int index = 0;
		if (!is) {
			cout << "cannot open file: " << path << endl;
		}
		else {
			if (is) {
				double temp = 0;
				for (int r = 0; r < 25; r++)
				{
					is.read((char*)&temp, sizeof(temp));
					b[0].at<double>(r, 0) = double(temp);
				}
				for (int l = 0; l < 400; l++)
				{
					for (int r = 0; r < 25; r++)
					{
						is.read((char*)&temp, sizeof(temp));
						W[0].at<double>(r, l) = double(temp);
					}
				}
				for (int r = 0; r < 10; r++)
				{
					is.read((char*)&temp, sizeof(temp));
					b[1].at<double>(r, 0) = double(temp);
					//cout << double(temp);
				}
				for (int l = 0; l < 25; l++)
				{
					for (int r = 0; r < 10; r++)
					{

						is.read((char*)&temp, sizeof(temp));
						W[1].at<double>(r, l) = double(temp);
					}
				}

			}
		}
		is.close();
	}
	void test_nncost_1()
	{
		//a1 = [ones(m, 1) X];
		//z1 = a1*Theta1';
		//	temp = sigmoid(z1);
		//a2 = [ones(m, 1), temp];
		//z2 = a2*Theta2';
		//	h = sigmoid(z2);
		//% compute the initial value of J
		//for k = 1:num_labels
		//	yk = y == k;
		//hthetak = h(:, k);
		//Jk = 1 / m * sum(-yk.*log(hthetak) - (1 - yk).*log(1 - hthetak));
		//J = J + Jk;
		//end
		delete[]active_value;
		active_value = new Mat[2];
		int data_size = outp.rows;
		active_value[0] = repeat(b[0], 1, data_size);
		active_value[1] = repeat(b[1], 1, data_size);
		//cout << "sizeof av0 : " << active_value[0].size()<<endl;
		//cout << "sizeof av1 : " << active_value[1].size() << endl;
		//cout << "sizeof data : " << data.size() << endl;
		active_value[0] = sigmoid(W[0]*data.t()+active_value[0]);
		active_value[1] = sigmoid(W[1]*active_value[0] + active_value[1]);
		//cout << endl<<active_value[1].at<double>(5, 5);
		//for k = 1:num_labels
		//	yk = y == k;
		//hthetak = h(:, k);
		//Jk = 1 / m * sum(-yk.*log(hthetak) - (1 - yk).*log(1 - hthetak));
		//J = J + Jk;
		//end
		Mat yk = Mat::zeros(10, data_size, CV_64FC1);
		for (int i = 0; i < data_size; i++)
			yk.at<double>(int(outp.at<double>(i, 0))-1,i) = 1;

		double J = sum((-1 * yk).mul(mat_log(active_value[1])) - (1 - yk).mul(mat_log(1 - active_value[1])))[0]/data_size;

		//cout << "Cost at parameters (loaded from ex4weights)\n(this value should be about 0.287629)\n" << J<<endl;

		lambda = 1;
		J += lambda / 2 / data_size * (sum(W[0].mul(W[0]))[0] + sum(W[1].mul(W[1]))[0]);
		//cout << "Cost at parameters (loaded from ex4weights)\n(this value should be about 0.383770)\n" << J<<endl;
		cost = J;
		Mat delta3 = (active_value[1] - yk);
		Mat tem = (delta3.t()*W[1]).t();
		Mat delta2 = tem.mul(active_value[0]).mul(1 - active_value[0]);

		Wgrad[1] = delta3*active_value[0].t() / data_size + lambda*W[1] / data_size;
		Wgrad[0] = delta2*data / data_size + lambda*W[0] / data_size;
		bgrad[1] = Mat(delta3.rows, 1, CV_64FC1, Scalar::all(0));
		bgrad[0] = Mat(delta2.rows, 1, CV_64FC1, Scalar::all(0));
		reduce(delta3, bgrad[1], 1, 1);
		reduce(delta2, bgrad[0], 1, 1);

	}
	void test_nncost_2()
	{
		active_value = new Mat[2];
		int data_size = outp.rows;
		active_value[0] = repeat(b[0], 1, data_size);
		active_value[1] = repeat(b[1], 1, data_size);
		cout << "sizeof av0 : " << active_value[0].size() << endl;
		cout << "sizeof av1 : " << active_value[1].size() << endl;
		cout << "sizeof data : " << data.size() << endl;
		cout << "sizeof W[0] : " << W[0].size() << endl;
		cout << "sizeof W[1] : " << W[1].size() << endl;
		active_value[0] = sigmoid(W[0] * data.t() + active_value[0]);
		active_value[1] = sigmoid(W[1] * active_value[0] + active_value[1]);
		
		Mat yk = Mat::zeros(3, data_size, CV_64FC1);
		for (int i = 0; i < data_size; i++)
				yk.at<double>(int(outp.at<double>(i, 0)) - 1, i) = 1;
		//cout << mat_log(active_value[1]) << endl;
		double J = sum((-1 * yk).mul(mat_log(active_value[1])) - (1 - yk).mul(mat_log(1 - active_value[1])))[0] / data_size;

		cout << "Cost at parameters \n(this value should be about 2.10095)\n" << J << endl;
		cost = J;
		J += lambda / 2 / data_size * (sum(W[0].mul(W[0]))[0] + sum(W[1].mul(W[1]))[0]);
		cout << "Cost at parameters \n(this value should be about )\n" << 0.0151364*lambda<< J - cost << endl;
		cost = J;

		Mat delta3 = (active_value[1] - yk);
		Mat tem = (delta3.t()*W[1]).t();
		Mat delta2 = tem.mul(active_value[0]).mul(1 - active_value[0]);

		Wgrad[1] = delta3*active_value[0].t() / data_size +lambda*W[1]/data_size;
		Wgrad[0] = delta2*data / data_size + lambda*W[0]/data_size;
		bgrad[1] = Mat(delta3.rows, 1, CV_64FC1, Scalar::all(0));
		bgrad[0] = Mat(delta2.rows, 1, CV_64FC1, Scalar::all(0));
		reduce(delta3, bgrad[1], 1, 1);
		reduce(delta2, bgrad[0], 1, 1);
		//cout << Wgrad[0] << endl;
	}
	double test_nncost_3(double lambda, Mat *active_value, Mat *b, Mat *W)
	{
		active_value = new Mat[2];
		int data_size = outp.rows;
		active_value[0] = repeat(b[0], 1, data_size);
		active_value[1] = repeat(b[1], 1, data_size);

		active_value[0] = sigmoid(W[0] * data.t() + active_value[0]);
		active_value[1] = sigmoid(W[1] * active_value[0] + active_value[1]);

		Mat yk = Mat::zeros(3, data_size, CV_64FC1);
		for (int i = 0; i < data_size; i++)
			yk.at<double>(int(outp.at<double>(i, 0)) - 1, i) = 1;
		double J = sum((-1 * yk).mul(mat_log(active_value[1])) - (1 - yk).mul(mat_log(1 - active_value[1])))[0] / data_size;
		J += lambda / 2 / data_size * (sum(W[0].mul(W[0]))[0] + sum(W[1].mul(W[1]))[0]);

		delete[] active_value;	//free the temporary activations allocated in this function
		return J;
	}
	Mat sigmoidGradient(Mat inp)
	{
		return sigmoid(inp).mul(1 - sigmoid(inp));
	}
	void writeMatToFile(cv::Mat& m, const char* filename)
	{
		std::ofstream fout(filename);

		if (!fout)
		{
			std::cout << "File Not Opened" << std::endl;
			return;
		}

		for (int i = 0; i<m.rows; i++)
		{
			for (int j = 0; j<m.cols; j++)
			{
				fout << m.at<double>(i, j) << "\t";
			}
			fout << std::endl;
		}

		fout.close();
	}
	void computeNumericalGradient()
	{
		double e = 0.0001; int count = 0;	//use the member lambda so the regularized gradients are checked too
		Mat *WW, *bb, *ac;
		ac = new Mat[layer_num];
		WW = new Mat[layer_num];
		bb = new Mat[layer_num];
		test_bgrad = new Mat[layer_num];
		test_Wgrad = new Mat[layer_num];
		for (int i = 0; i < layer_num; i++)
		{
			WW[i] = W[i].clone();
			bb[i] = b[i].clone();
		}
		double diff1 = 0, diff2 = 0;
		for (int i = 0; i < layer_num; i++)
		{
			test_bgrad[i] = Mat(bgrad[i].size(), CV_64FC1);
			for (int j = 0; j < bb[i].rows; j++)
			{
				for (int k = 0; k < bb[i].cols; k++)
				{
					bb[i].at<double>(j, k) += e;
					double loss1 = test_nncost_3(lambda, ac, bb, WW);
					bb[i].at<double>(j, k) -= 2*e;
					double loss2 = test_nncost_3(lambda, ac, bb, WW);
					bb[i].at<double>(j, k) += e;
					test_bgrad[i].at<double>(j, k) = (loss1 - loss2) / 2 / e;
				}
			}
			//accumulate the squared differences over all layers and parameters
			diff1 += sum((test_bgrad[i] - bgrad[i]).mul(test_bgrad[i] - bgrad[i]))[0];
			diff2 += sum((test_bgrad[i] + bgrad[i]).mul(test_bgrad[i] + bgrad[i]))[0];
		}
		for (int i = 0; i < layer_num; i++)
		{
			test_Wgrad[i] = Mat(Wgrad[i].size(), CV_64FC1);
			for (int j = 0; j < WW[i].rows; j++)
			{
				for (int k = 0; k < WW[i].cols; k++)
				{
					WW[i].at<double>(j, k) += e;
					double loss1 = test_nncost_3(lambda, ac, bb, WW);
					WW[i].at<double>(j, k) -= 2 * e;
					double loss2 = test_nncost_3(lambda, ac, bb, WW);
					WW[i].at<double>(j, k) += e;
					test_Wgrad[i].at<double>(j, k) = (loss1 - loss2) / 2 / e;
				}
			}
			diff1 += sum((test_Wgrad[i] - Wgrad[i]).mul(test_Wgrad[i] - Wgrad[i]))[0];
			diff2 += sum((test_Wgrad[i] + Wgrad[i]).mul(test_Wgrad[i] + Wgrad[i]))[0];
		}
		cout << "diff is : " << sqrt(diff1 / diff2)<<endl;

	}
	Mat debugInitializeWeights(int fan_out, int fan_in)
	{ 
		Mat res = Mat::zeros(fan_out, fan_in + 1,CV_64FC1);
		int c = 1;
		for (int i = 0; i < res.cols; i++)
		{
			for (int j = 0; j < res.rows; j++)
			{
				res.at<double>(j, i) = sin(c++);
			}
		}
		return res / 10;
	}
	void checkNNGradients()
	{
		int input_layer_size = 3,hidden_layer_size = 5,num_labels = 3,m = 5;
		Mat Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size),//size 5*4
			Theta2 = debugInitializeWeights(num_labels, hidden_layer_size),//3*6
			X = debugInitializeWeights(m, input_layer_size - 1),//5*3
			y = Mat(m, 1, CV_64FC1);
		for (int i = 1; i <= m; i++)
		{
			y.at<double>(i - 1, 0) = i%num_labels+1;
		}
		//cout << y;
		Nnetwork checknn(input_layer_size, hidden_layer_size, num_labels, 2,lambda);
		checknn.data = X;
		checknn.outp = y;
		checknn.W[0] = Mat::zeros(hidden_layer_size, input_layer_size, CV_64FC1);//5*3
		checknn.W[1] = Mat(num_labels, hidden_layer_size, CV_64FC1);//3*5
		checknn.b[0] = Mat(hidden_layer_size, 1, CV_64FC1);//5*1
		checknn.b[1] = Mat(num_labels, 1, CV_64FC1);//3*1
		for (int i = 0; i < Theta1.rows; i++)
			for (int j = 1; j < Theta1.cols; j++)
				checknn.W[0].at<double>(i, j-1) = Theta1.at<double>(i,j);
		for (int j = 0; j < Theta1.rows; j++)
			checknn.b[0].at<double>(j, 0) = Theta1.at<double>(j, 0);
		for (int i = 0; i < Theta2.rows; i++)
			for (int j = 1; j < Theta2.cols; j++)
				checknn.W[1].at<double>(i, j - 1) = Theta2.at<double>(i, j);
		for (int j = 0; j < Theta2.rows; j++)
			checknn.b[1].at<double>(j, 0) = Theta2.at<double>(j, 0);	//note: Theta2, not Theta1



		checknn.test_nncost_2();
		checknn.computeNumericalGradient();
		
	}
	void train()
	{
		a = cost < 1 ? 0.1 : 0.2;	//step size: 0.2 at first, 0.1 once the cost drops below 1
		test_nncost_1();
		for (int i = 0; i < layer_num; i++)
		{
			W[i] = W[i] - Wgrad[i] * a;
			b[i] = b[i] - bgrad[i] * a;
		}
		cout << "cost is " << cost << "   precise is "<<pre_dict()*100<<"%";
	}
	double predict()
	{
		double count = 0;
		for (int i = 0; i < active_value[1].cols; i++)
		{
			double max = 0; int index = 0;
			for (int j = 0; j < active_value[1].rows; j++)
			{
				if (active_value[1].at<double>(j,i)>max)
				{
					index = j;
					max = active_value[1].at<double>(j,i);
				}
			}
			if (index == int(outp.at<double>(i, 0) - 1)) ++count;
		}
		return count / active_value[1].cols;
	}
	double pre_dict()
	{
		delete[]av;
		av = new Mat[2];
		int data_size = pre_outp.rows;
		av[0] = repeat(b[0], 1, data_size);
		av[1] = repeat(b[1], 1, data_size);
		av[0] = sigmoid(W[0] * pre_data.t() + av[0]);
		av[1] = sigmoid(W[1] * av[0] + av[1]);
		double count = 0;
		for (int i = 0; i < av[1].cols; i++)
		{
			double max = 0; int index = 0;
			for (int j = 0; j < av[1].rows; j++)
			{
				if (av[1].at<double>(j, i)>max)
				{
					index = j;
					max = av[1].at<double>(j, i);
				}
			}
			if (index == int(pre_outp.at<double>(i, 0) - 1)) ++count;
		}
		return count / av[1].cols;
	}
	void before_train()
	{
		//the data set stores 500 examples per digit: the first 400 of each digit go to the
		//training set (4000 samples), the remaining 100 to the prediction set (1000 samples)
		Mat train_data = Mat(4000, 400, CV_64FC1);
		Mat train_outp = Mat(4000, 1, CV_64FC1);
		int c = 0;
		for (int i = 0; i < 10; i++)
		{
			for (int j = i * 500; j < i * 500 + 400; j++)
			{
				for (int k = 0; k < 400; k++)
				{
					train_data.at<double>(c, k) = data.at<double>(j, k);
				}
				train_outp.at<double>(c++, 0) = outp.at<double>(j, 0);

			}
		}
		c = 0;
		pre_data = Mat(1000, 400, CV_64FC1);
		pre_outp = Mat(1000, 1, CV_64FC1);
		for (int i = 0; i < 10; i++)
		{
			for (int j = i * 500 + 400; j < i * 500 + 500; j++)
			{
				for (int k = 0; k < 400; k++)
				{
					pre_data.at<double>(c, k) = data.at<double>(j, k);
				}
				pre_outp.at<double>(c++, 0) = outp.at<double>(j, 0);
				
			}
		}
		data = train_data;
		outp = train_outp;
	}
};

int main()
{
	Nnetwork tt(400,25,10,2,3);
	//step_1:Loading and Visualizing Data

	tt.test_readdata();
	tt.test_readlabel();
	tt.showimage(tt.data, 20, 70);
	{
		//step_2:Loading Parameters
		tt.test_load_Param();

		//step_3:Compute Cost (Feedforward)
		tt.test_nncost_1();
		//step_4:Implement Regularization
		//step_5:Sigmoid Gradient

		//step_6:Initializing Parameters 
		tt.initParam();
		//step_7:Implement Backpropagation
		tt.lambda = 0;
		tt.checkNNGradients();
		//step_8:Implement Regularization
		tt.lambda = 1;
		tt.checkNNGradients();
	}
	//step_9:Training NN

	tt.initParam();
	tt.before_train();
	int it = 2500, i = it;


	cout << endl << "Timing started" << endl;

	clock_t start, finish;
	double totaltime;
	start = clock();

	//the timed training loop
	while (i--)
	{
		cout << endl << "iter " << it - i << " : ";
		tt.train();
	}
	finish = clock();
	totaltime = (double)(finish - start) / CLOCKS_PER_SEC;
	cout << "\n此程序的运行时间为" << totaltime << "秒!" << endl;
}