Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
添加说明
  • Loading branch information
1mrliu committed Jan 15, 2018
commit ed2c1f6d361a0d3dd0af8bf6dad4a814ac86be07
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个文件可以不需要提交滴

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已经删除这个文件

"python.pythonPath": "/usr/local/bin/python3.6"
}
23 changes: 14 additions & 9 deletions src/python/getting-started/digit-recognizer/dr-python-3.6.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def opencsv():
return trainData, trainLabel, testData


# 数据预处理-降维
# 数据预处理-降维(PCA 主成分分析)
def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
print('dimensionality reduction...')
trainData = np.array(x_train)
Expand Down Expand Up @@ -65,27 +65,32 @@ def saveResult(result, csvName):
for r in result:
index += 1
myWriter.writerow([index, int(r)])

print('Saved successfully...') # 保存预测结果


# 分析数据
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
def analyse_data(dataMat):
meanVals = np.mean(dataMat, axis=0) # np.mean 求平均值
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
# 计算协方差矩阵,除数 n-1 是为了得到协方差的无偏估计
#cov(X,0) = cov(X) 除数是n-1(n为样本个数)
#cov(X,1) 除数是n
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
# np.mat 是用来生成一个矩阵的
# 保存特征值(eigvals)和对应的特征向量(eigVects)
eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中
eigValInd = np.argsort(eigvals) # argsort 对特征值矩阵进行排序,返回的是数值从小到大的索引值
eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值

topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数

topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数
eigValInd = eigValInd[:-(topNfeat+1):-1] # 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值,返回其对应的索引
# 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值,返回其对应的索引
eigValInd = eigValInd[:-(topNfeat+1):-1]

# 计算特征值的总和
cov_all_score = float(sum(eigvals))
sum_cov_score = 0
for i in range(0, len(eigValInd)):
# 特征值进行相加
line_cov_score = float(eigvals[eigValInd[i]])
sum_cov_score += line_cov_score
'''
Expand All @@ -105,7 +110,7 @@ def getOptimalAccuracy(trainData, trainLabel, preData):
# 分析数据 100个特征左右
# analyse_data(trainData)
x_train, x_test, y_train, y_test = train_test_split(trainData, trainLabel, test_size=0.1)
lineLen, featureLen = np.shape(x_test)
lineLen, featureLen = np.shape(x_test) # shape 返回矩阵或者数值的长度
# print(lineLen, type(lineLen), featureLen, type(featureLen))

minErr = 1
Expand Down