Skip to content

Commit e81cf68

Browse files
authored
Add files via upload
1 parent 91082e6 commit e81cf68

File tree

7 files changed

+639
-0
lines changed

7 files changed

+639
-0
lines changed
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Mon Jul 30 14:17:56 2018
4+
5+
@author: wzy
6+
"""
7+
import numpy as np
8+
from bs4 import BeautifulSoup
9+
import random
10+
11+
"""
12+
函数说明:从页面读取数据,生成retX和retY列表
13+
14+
Parameters:
15+
retX - 数据X
16+
retY - 数据Y
17+
inFile - HTML文件
18+
yr - 年份
19+
numPce - 乐高部件数目
20+
origPrc - 原价
21+
22+
Returns:
23+
None
24+
25+
Modify:
26+
2018-07-30
27+
"""
28+
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Read one saved HTML listing page and append its sold items to retX / retY.

    Parameters:
        retX - list, extended in place with one [yr, numPce, newFlag, origPrc]
               row per accepted listing
        retY - list, extended in place with the matching selling price
        inFile - path of the HTML file (read as UTF-8)
        yr - year, copied unchanged into every row
        numPce - number of LEGO pieces, copied unchanged into every row
        origPrc - original price; copied into every row and used as the
                  reference for the "incomplete set" filter

    Returns:
        None

    Raises:
        ValueError - if a price cell cannot be converted to float
        IndexError - if a row lacks the anchors / cells indexed below
    """
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the same file
    # could be parsed differently on different machines.
    soup = BeautifulSoup(html, 'html.parser')
    i = 1
    # Listings are <table> elements carrying an attribute r="1", r="2", ...
    currentRow = soup.find_all('table', r='%d' % i)
    while currentRow:
        title = currentRow[0].find_all('a')[1].text
        lwrTitle = title.lower()
        # A title mentioning "new" or "nisb" marks the item as brand new.
        if 'new' in lwrTitle or 'nisb' in lwrTitle:
            newFlag = 1.0
        else:
            newFlag = 0.0
        cells = currentRow[0].find_all('td')
        # Only listings flagged as sold (a <span> in the 4th cell) are kept.
        soldUnicde = cells[3].find_all('span')
        if len(soldUnicde) == 0:
            print("商品#%d没有出售" % i)
        else:
            # The 5th cell holds the price text; strip currency formatting
            # and any shipping note before converting.
            priceStr = cells[4].text
            for noise in ('$', ',', 'Free shipping'):
                priceStr = priceStr.replace(noise, '')
            sellingPrice = float(priceStr)
            # Drop prices at or below half the original price
            # (treated as incomplete sets).
            if sellingPrice > origPrc * 0.5:
                print('%d\t%d\t%d\t%f\t%f' % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.find_all('table', r='%d' % i)
65+
66+
67+
"""
68+
函数说明:依次读取六种乐高套装的数据,并生成数据矩阵
69+
70+
Parameters:
71+
retX - 数据X
72+
retY - 数据Y
73+
74+
Returns:
75+
None
76+
77+
Modify:
78+
2018-07-30
79+
"""
80+
def setDataCollect(retX, retY, dataDir='./lego'):
    """Scrape the six saved LEGO listing pages into retX / retY.

    Parameters:
        retX - list, extended in place by scrapePage with feature rows
        retY - list, extended in place by scrapePage with selling prices
        dataDir - directory holding the saved HTML pages
                  (defaults to './lego', the location used originally)

    Returns:
        None
    """
    # (file name, year, number of pieces, original price)
    legoSets = (
        ('lego8288.html', 2006, 800, 49.99),
        ('lego10030.html', 2002, 3096, 269.99),
        ('lego10179.html', 2007, 5195, 499.99),
        ('lego10181.html', 2007, 3428, 199.99),
        ('lego10189.html', 2008, 5922, 299.99),
        ('lego10196.html', 2009, 3263, 249.99),
    )
    for fileName, yr, numPce, origPrc in legoSets:
        scrapePage(retX, retY, '%s/%s' % (dataDir, fileName), yr, numPce, origPrc)
88+
89+
90+
"""
91+
函数说明:数据标准化
92+
93+
Parameters:
94+
xMat - x数据集
95+
yMat - y数据集
96+
97+
Returns:
98+
inxMat - 标准化后的x数据集
99+
inyMat - 标准化后的y数据集
100+
101+
Modify:
102+
2018-07-30
103+
"""
104+
def regularize(xMat, yMat):
    """Centre yMat and scale the columns of xMat.

    Each column of xMat has its mean subtracted and is then divided by its
    variance (not its standard deviation). yMat only has its mean subtracted.
    The inputs are not modified.

    Parameters:
        xMat - x data set
        yMat - y data set

    Returns:
        inxMat - scaled x data set
        inyMat - centred y data set
    """
    # Subtract the mean (taken along axis 0) from the targets.
    inyMat = yMat - np.mean(yMat, 0)
    # Per-column mean and variance of the features.
    inMeans = np.mean(xMat, 0)
    inVar = np.var(xMat, 0)
    print(inMeans)
    # A constant column has zero variance; dividing by it would give 0/0 = NaN.
    # Use 1 instead so the centred column simply becomes zeros.
    safeVar = np.where(inVar == 0, 1.0, inVar)
    inxMat = (xMat - inMeans) / safeVar
    return inxMat, inyMat
120+
121+
122+
"""
123+
函数说明:计算平方误差
124+
125+
Parameters:
126+
yArr - 预测值
127+
yHatArr - 真实值
128+
129+
Returns:
130+
平方误差
131+
132+
Modify:
133+
2018-07-30
134+
"""
135+
def rssError(yArr, yHatArr):
    """Return the sum of squared element-wise differences of two value sets.

    The result is symmetric in its arguments, so it does not matter which
    one holds the predictions and which the observed values.

    Parameters:
        yArr - first set of values (array, matrix or nested list)
        yHatArr - second set of values, broadcastable against yArr

    Returns:
        sum of squared differences
    """
    # np.asarray makes '**' an element-wise square even for np.matrix input
    # (where '**' would mean matrix power) and lets plain lists be passed.
    residual = np.asarray(yArr) - np.asarray(yHatArr)
    return (residual ** 2).sum()
137+
138+
139+
"""
140+
函数说明:计算回归系数w
141+
142+
Parameters:
143+
xArr - x数据集
144+
yArr - y数据集
145+
146+
Returns:
147+
ws - 回归系数
148+
149+
Modify:
150+
2018-07-30
151+
"""
152+
def standRegres(xArr, yArr):
    """Compute ordinary least-squares weights from the normal equations.

    Solves (X^T X) w = X^T y.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample

    Returns:
        ws - regression coefficients as an (n_features, 1) matrix,
             or None (after printing a message) when X^T X is singular
    """
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    try:
        # Solving the system directly avoids forming the explicit inverse and
        # replaces the exact floating-point test 'det == 0.0'.
        ws = np.linalg.solve(xTx, xMat.T * yMat)
    except np.linalg.LinAlgError:
        print("矩阵为奇异矩阵,不能求逆")
        return None
    return np.asmatrix(ws)
163+
164+
165+
"""
166+
函数说明:岭回归
167+
168+
Parameters:
169+
xMat - x数据集
170+
yMat - y数据集
171+
lam - 缩减系数
172+
173+
Returns:
174+
ws - 回归系数
175+
176+
Modify:
177+
2018-07-30
178+
"""
179+
def ridgeRegres(xMat, yMat, lam=0.2):
    """Compute ridge-regression weights.

    Solves (X^T X + lam * I) w = X^T y.

    Parameters:
        xMat - x data set, one sample per row
        yMat - y data set as a column (n_samples, 1)
        lam - shrinkage coefficient added to the diagonal

    Returns:
        ws - regression coefficients as an (n_features, 1) matrix,
             or None (after printing a message) when the system is singular
    """
    # Coerce to matrices so '*' is always a matrix product, even if the
    # caller hands in plain ndarrays.
    xMat = np.asmatrix(xMat)
    yMat = np.asmatrix(yMat)
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    try:
        # Solving the system directly avoids forming the explicit inverse and
        # replaces the exact floating-point test 'det == 0.0'.
        ws = np.linalg.solve(denom, xMat.T * yMat)
    except np.linalg.LinAlgError:
        print("矩阵为奇异矩阵,不能求逆")
        return None
    return np.asmatrix(ws)
189+
190+
191+
"""
192+
函数说明:岭回归测试
193+
194+
Parameters:
195+
xArr - x数据集
196+
yArr - y数据集
197+
198+
Returns:
199+
wMat - 回归系数矩阵
200+
201+
Modify:
202+
2018-07-30
203+
"""
204+
def ridgeTest(xArr, yArr, numTestPts=30):
    """Fit ridge regression for a sequence of exponentially growing lambdas.

    The targets are mean-centred and every feature column is mean-centred and
    divided by its variance before fitting. Row i of the result holds the
    weights obtained with lambda = exp(i - 10).

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
        numTestPts - number of lambda values to try (default 30)

    Returns:
        wMat - (numTestPts, n_features) array of regression coefficients

    Raises:
        numpy.linalg.LinAlgError - if ridgeRegres reports a singular system
    """
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    # Centre the targets.
    yMat = yMat - np.mean(yMat, axis=0)
    # Per-column mean and variance of the features.
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    # A constant column has zero variance; dividing by it would give NaN.
    # Use 1 instead so the centred column becomes zeros.
    xVar = np.where(xVar == 0, 1.0, xVar)
    xMat = np.asmatrix((xMat - xMeans) / xVar)
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        # lambda grows exponentially, starting from the very small exp(-10).
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        if ws is None:
            raise np.linalg.LinAlgError(
                "ridge system is singular for lambda = exp(%d)" % (i - 10))
        wMat[i, :] = ws.T
    return wMat
229+
230+
231+
"""
232+
函数说明:使用简单的线性回归
233+
234+
Parameters:
235+
None
236+
237+
Returns:
238+
None
239+
240+
Modify:
241+
2018-07-30
242+
"""
243+
def useStandRegres():
    """Fit plain linear regression to the scraped LEGO data and print the model.

    Collects the data with setDataCollect, prepends a column of ones for the
    intercept, fits with standRegres and prints the resulting formula.
    Nothing is printed by this function when standRegres returns None.

    Parameters:
        None

    Returns:
        None
    """
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    data_num, features_num = np.shape(lgX)
    # Column 0 stays all ones (intercept term); the features fill the rest.
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    lgx1 = np.asmatrix(np.ones((data_num, features_num + 1)))
    lgx1[:, 1:] = np.asmatrix(lgX)
    ws = standRegres(lgx1, lgY)
    if ws is None:
        # standRegres has already reported the singular matrix.
        return
    # Flatten to plain floats: formatting 1x1 matrices with %f relies on an
    # array-to-scalar conversion that NumPy has deprecated.
    coefs = tuple(float(w) for w in np.asarray(ws).ravel())
    print("%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价" % coefs)
254+
255+
256+
"""
257+
函数说明:交叉验证岭回归
258+
259+
Parameters:
260+
xArr - x数据集
261+
yArr - y数据集
262+
numVal - 交叉验证次数
263+
264+
Returns:
265+
None
266+
267+
Modify:
268+
2018-07-30
269+
"""
270+
def crossValidation(xArr, yArr, numVal=10):
    """Choose a ridge lambda by repeated random 90/10 splits and print the model.

    For each of numVal rounds the sample indices are shuffled, the first 90%
    are used to fit ridgeTest and the rest are scored with rssError for every
    lambda. The lambda with the lowest mean error is selected, its weights
    are converted back to the original feature scale and the formula is
    printed.

    Parameters:
        xArr - x data set, one sample (four features) per row
        yArr - y data set, one value per sample
        numVal - number of cross-validation rounds

    Returns:
        None

    Raises:
        ValueError - if numVal < 1 or there are too few samples to leave at
                     least one test sample
    """
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    m = len(yArr)
    indexList = list(range(m))
    # Same split as testing 'j < m * 0.9' for every position j.
    numTrain = int(np.ceil(m * 0.9))
    if numTrain >= m:
        raise ValueError("need at least 10 samples to hold out a test set")
    errorRows = []
    for _ in range(numVal):
        random.shuffle(indexList)
        trainIdx = indexList[:numTrain]
        testIdx = indexList[numTrain:]
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]
        # One row of weights per lambda tried by ridgeTest.
        wMat = ridgeTest(trainX, trainY)
        # Scale the test features with the training mean / variance. This
        # does not depend on lambda, so it is done once per round.
        # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        # Guard constant columns against division by zero.
        varTrain = np.where(varTrain == 0, 1.0, varTrain)
        matTestX = np.asmatrix((np.asmatrix(testX) - meanTrain) / varTrain)
        yMeanTrain = np.mean(trainY)
        testYArr = np.array(testY)
        roundErrors = []
        for k in range(wMat.shape[0]):
            # Add the training mean back to undo the centring of y.
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + yMeanTrain
            roundErrors.append(rssError(yEst.T.A, testYArr))
        errorRows.append(roundErrors)
    errorMat = np.array(errorRows)
    meanErrors = np.mean(errorMat, 0)
    # argmin yields exactly one row, even if several lambdas tie.
    # NOTE(review): wMat here holds the weights fitted in the final round
    # only, so the reported weights come from that round's training split.
    bestWeights = wMat[np.argmin(meanErrors)]
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    varX = np.where(varX == 0, 1.0, varX)
    # Undo the variance scaling so the weights apply to raw features.
    unReg = bestWeights / varX
    intercept = -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)
    print("%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价" % (intercept, unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))
312+
313+
314+
if __name__ == '__main__':
    # Script entry point: collect the LEGO data, show the ridge coefficient
    # table, then run the cross-validated ridge fit.
    # useStandRegres()
    lgX, lgY = [], []
    setDataCollect(lgX, lgY)
    ridgeWeights = ridgeTest(lgX, lgY)
    print(ridgeWeights)
    crossValidation(lgX, lgY)
321+

0 commit comments

Comments
 (0)