# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 14:17:56 2018

@author: wzy
"""
7+ import numpy as np
8+ from bs4 import BeautifulSoup
9+ import random
10+
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Read one saved listing page and append its sold items to retX / retY.

    Rows are looked up as ``<table r="1">``, ``<table r="2">``, ... and the
    scan stops at the first index for which no such table exists.  For every
    row that carries a "sold" marker and whose price is more than half of
    ``origPrc``, ``[yr, numPce, newFlag, origPrc]`` is appended to ``retX``
    and the selling price to ``retY``; the kept sample is also printed.

    Parameters:
        retX - list that receives the feature rows (modified in place)
        retY - list that receives the selling prices (modified in place)
        inFile - path of the HTML file, read as UTF-8
        yr - year stored in each feature row
        numPce - piece count stored in each feature row
        origPrc - original price; also used for the "more than half" filter

    Returns:
        None
    """
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the parse tree could
    # differ from one machine to the next.
    soup = BeautifulSoup(html, 'html.parser')
    i = 1
    currentRow = soup.find_all('table', r='%d' % i)
    while len(currentRow) != 0:
        row = currentRow[0]
        # The item title is the text of the second link in the row.
        lwrTitle = row.find_all('a')[1].text.lower()
        # Flag the item as brand new when the title says so.
        if ('new' in lwrTitle) or ('nisb' in lwrTitle):
            newFlag = 1.0
        else:
            newFlag = 0.0
        # Look the cells up once; both the sold marker and the price use them.
        cells = row.find_all('td')
        # Only sold items are collected: a <span> in the 4th cell marks a sale.
        soldUnicde = cells[3].find_all('span')
        if len(soldUnicde) == 0:
            print("商品#%d没有出售" % i)
        else:
            # The 5th cell holds the price text.
            soldPrice = cells[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$', '')
            priceStr = priceStr.replace(',', '')
            # A cell with more than one child node may also carry a
            # shipping note that has to be removed before conversion.
            if len(soldPrice) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Drop prices at or below half the original price
            # (treated as incomplete sets).
            if sellingPrice > origPrc * 0.5:
                print('%d\t %d\t %d\t %f\t %f' % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.find_all('table', r='%d' % i)
65+
66+
def setDataCollect(retX, retY):
    """Scrape the six saved LEGO set pages in turn into retX / retY.

    Parameters:
        retX - list that receives the feature rows (modified in place)
        retY - list that receives the selling prices (modified in place)

    Returns:
        None
    """
    # One entry per set: (HTML file, year, piece count, original price).
    # The first entry is set 8288 from 2006 with 800 pieces, priced 49.99.
    lego_sets = (
        ('./lego/lego8288.html', 2006, 800, 49.99),
        ('./lego/lego10030.html', 2002, 3096, 269.99),
        ('./lego/lego10179.html', 2007, 5195, 499.99),
        ('./lego/lego10181.html', 2007, 3428, 199.99),
        ('./lego/lego10189.html', 2008, 5922, 299.99),
        ('./lego/lego10196.html', 2009, 3263, 249.99),
    )
    for page, year, pieces, price in lego_sets:
        scrapePage(retX, retY, page, year, pieces, price)
88+
89+
def regularize(xMat, yMat):
    """Centre y, and centre/scale every column of x.

    Each x column has its mean subtracted and is then divided by that
    column's variance (``np.var``, i.e. not the standard deviation).
    The column means are printed as a side effect.

    Parameters:
        xMat - x data set
        yMat - y data set

    Returns:
        inxMat - scaled copy of the x data set
        inyMat - y data set with its mean removed
    """
    # Work on copies so the caller's data is left untouched.
    features = xMat.copy()
    targets = yMat.copy()
    # Remove the mean from the targets.
    centred_targets = targets - np.mean(yMat, 0)
    # Per-column mean and variance (sum of squared deviations divided by N).
    column_means = np.mean(features, 0)
    column_vars = np.var(features, 0)
    print(column_means)
    scaled_features = (features - column_means) / column_vars
    return scaled_features, centred_targets
120+
121+
def rssError(yArr, yHatArr):
    """Return the sum of squared differences between two arrays.

    Parameters:
        yArr - first array of values
        yHatArr - second array of values (broadcast against yArr)

    Returns:
        The squared error summed over all elements
    """
    residual = yArr - yHatArr
    return (residual * residual).sum()
137+
138+
def standRegres(xArr, yArr):
    """Compute ordinary least-squares weights w = (X^T X)^-1 X^T y.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample

    Returns:
        ws - column matrix of regression weights, or None (after printing
             a message) when X^T X has a determinant of exactly 0
    """
    # np.asmatrix is the same function the removed np.mat alias pointed to,
    # so this also runs on NumPy 2.x.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means the matrix cannot be inverted.
    if np.linalg.det(xTx) == 0.0:
        print("矩阵为奇异矩阵,不能求逆")
        return None
    # .I is the matrix inverse.
    ws = (xTx.I) * (xMat.T) * yMat
    return ws
163+
164+
def ridgeRegres(xMat, yMat, lam=0.2):
    """Compute ridge-regression weights w = (X^T X + lam * I)^-1 X^T y.

    Parameters:
        xMat - x data set as a matrix, one sample per row
        yMat - y data set as a column matrix
        lam - shrinkage coefficient added on the diagonal

    Returns:
        ws - column matrix of regression weights, or None (after printing
             a message) when the penalised matrix has a determinant of 0
    """
    feature_count = np.shape(xMat)[1]
    penalised = xMat.T * xMat + np.eye(feature_count) * lam
    # A zero determinant means the matrix cannot be inverted.
    if np.linalg.det(penalised) == 0.0:
        print("矩阵为奇异矩阵,不能求逆")
        return
    # .I is the matrix inverse.
    return penalised.I * xMat.T * yMat
189+
190+
def ridgeTest(xArr, yArr, numTestPts=30):
    """Fit ridge regression for a sequence of exponentially growing lambdas.

    y is centred; every x column is centred and divided by its variance.
    Row i of the result holds the weights obtained with lambda = exp(i - 10).

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
        numTestPts - number of lambda values to try (default 30)

    Returns:
        wMat - array of shape (numTestPts, number of features) with one
               weight vector per lambda

    Raises:
        numpy.linalg.LinAlgError - if ridgeRegres reports a singular matrix
    """
    # np.asmatrix is the same function the removed np.mat alias pointed to,
    # so this also runs on NumPy 2.x.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    # Centre the targets.
    yMean = np.mean(yMat, axis=0)
    yMat = yMat - yMean
    # Centre each feature column and divide it by its variance.
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar
    # One row of weights per lambda.
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        # Lambda grows exponentially, starting from a very small value.
        lam = np.exp(i - 10)
        ws = ridgeRegres(xMat, yMat, lam)
        if ws is None:
            # ridgeRegres signals a singular matrix by returning None;
            # fail with a clear error instead of an AttributeError on None.
            raise np.linalg.LinAlgError(
                "ridge system is singular for lambda=%g" % lam)
        wMat[i, :] = ws.T
    return wMat
229+
230+
def useStandRegres():
    """Fit plain linear regression on the scraped data and print the model.

    Collects the data with setDataCollect, prepends a constant column of
    ones for the intercept, solves with standRegres and prints the fitted
    formula.  Nothing is printed by this function when standRegres reports
    a singular matrix (it has already printed its own message).

    Parameters:
        None

    Returns:
        None
    """
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    data_num, features_num = np.shape(lgX)
    # First column stays all ones (intercept); the rest are the features.
    # np.asmatrix is the same function the removed np.mat alias pointed to.
    lgx1 = np.asmatrix(np.ones((data_num, features_num + 1)))
    lgx1[:, 1:] = np.asmatrix(lgX)
    # Solve for the regression weights.
    ws = standRegres(lgx1, lgY)
    if ws is None:
        return
    # Index down to scalars: formatting a 1x1 matrix with %f relies on an
    # array-to-scalar conversion that newer NumPy versions deprecate.
    print("%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价" % (ws[0, 0], ws[1, 0], ws[2, 0], ws[3, 0], ws[4, 0]))
254+
255+
def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and print the best model.

    For each of ``numVal`` rounds the samples are shuffled, the first 90%
    are used to fit ridgeTest and the rest to measure the squared error of
    each of the 30 lambda settings.  The lambda with the lowest mean error
    selects a weight row (taken from the last round's fit), which is then
    converted back to the unscaled feature units and printed as a formula.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
        numVal - number of cross-validation rounds

    Returns:
        None (the resulting model is printed)

    Raises:
        ValueError - if numVal < 1 or the 90/10 split leaves no test samples
    """
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    # Number of lambda settings ridgeTest evaluates by default.
    numTestPts = 30
    m = len(yArr)
    indexList = list(range(m))
    errorMat = np.zeros((numVal, numTestPts))
    for i in range(numVal):
        # Shuffle in place so every round gets a different split.
        random.shuffle(indexList)
        # Positions below 90% of m go to training, the rest to testing.
        trainIdx = [idx for j, idx in enumerate(indexList) if j < m * 0.9]
        testIdx = [idx for j, idx in enumerate(indexList) if not j < m * 0.9]
        if not testIdx:
            raise ValueError(
                "not enough samples (%d) to hold out a test set" % m)
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]
        # Ridge weights for every lambda, fitted on the training part.
        wMat = ridgeTest(trainX, trainY)
        # Scale the test features with the *training* mean and variance.
        # This does not depend on lambda, so do it once per round.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)
        for k in range(numTestPts):
            # Add the training mean back to undo the centring of y.
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)
    meanErrors = np.mean(errorMat, 0)
    # Pick exactly one row (the first minimum).  Selecting every row that
    # equals the minimum would add several rows into the intercept on ties.
    bestWeights = wMat[np.argmin(meanErrors), :]
    # Convert the weights back to the original (unscaled) feature units.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    unReg = bestWeights / varX
    print("%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价" % ((-1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)), unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))
312+
313+
if __name__ == '__main__':
    # Scrape the data once, print the ridge weights for every lambda, then
    # cross-validate.  (useStandRegres() is the plain least-squares variant.)
    features, prices = [], []
    setDataCollect(features, prices)
    ridge_weights = ridgeTest(features, prices)
    print(ridge_weights)
    crossValidation(features, prices)
321+
# 0 commit comments