- CharToNewLineRex = re.compile("(<br/>|</p>|<tr>|<div>|</div>)")
- CharToNextTabRex = re.compile("<td>")
-
- # Convert common HTML character entities back to their literal characters
- replaceTab = [("&lt;","<"),("&gt;",">"),("&amp;","&"),("&quot;","\""),("&nbsp;"," ")]
-
- def Replace_Char(self,x):
- x = self.BgnCharToNoneRex.sub("",x)
- x = self.BgnPartRex.sub("\n ",x)
- x = self.CharToNewLineRex.sub("\n",x)
- x = self.CharToNextTabRex.sub("\t",x)
- x = self.EndCharToNoneRex.sub("",x)
-
- for t in self.replaceTab:
- x = x.replace(t[0],t[1])
- return x
-
-class Baidu_Spider:
- # Set up the spider's attributes
- def __init__(self,url):
- self.myUrl = url + '?see_lz=1'
- self.datas = []
- self.myTool = HTML_Tool()
- print u'Baidu Tieba spider started, clickety-clack'
-
- # Load the first page and decode it for storage
- def baidu_tieba(self):
- # Read the raw page and decode it from gbk
- myPage = urllib2.urlopen(self.myUrl).read().decode("gbk")
- # Count how many pages of original-poster content there are
- endPage = self.page_counter(myPage)
- # Get the title of the post
- title = self.find_title(myPage)
- print u'Post title: ' + title
- # Fetch and save the final data
- self.save_data(self.myUrl,title,endPage)
-
- # Count the total number of pages
- def page_counter(self,myPage):
- # Match "共有12页" ("12 pages in total") to get the page count
- myMatch = re.search(r'class="red">(\d+?)</span>', myPage, re.S)
- if myMatch:
- endPage = int(myMatch.group(1))
- print u'Spider report: the original poster has %d pages of content' % endPage
- else:
- endPage = 0
- print u'Spider report: unable to determine how many pages of content there are!'
- return endPage
-
- # Find the title of the post
- def find_title(self,myPage):
- # Match <h1 class="core_title_txt...">xxx</h1> to extract the title
- myMatch = re.search(r'<h1 class="core_title_txt.*?>(.*?)</h1>', myPage, re.S)
- title = u'Untitled'
- if myMatch:
- title = myMatch.group(1)
- else:
- print u'Spider report: unable to load the post title!'
- # File names may not contain any of these characters: \ / : * ? " < > |
- title = title.replace('\\','').replace('/','').replace(':','').replace('*','').replace('?','').replace('"','').replace('>','').replace('<','').replace('|','')
- return title
-
-
- # Save the content posted by the original poster
- def save_data(self,url,title,endPage):
- # Load the page data into the list
- self.get_data(url,endPage)
- # Open the local file
- f = open(title+'.txt','w+')
- f.writelines(self.datas)
- f.close()
- print u'Spider report: the content has been saved locally as a txt file'
- print u'Press any key to exit...'
- raw_input()
-
- # Fetch each page's source and store it in the list
- def get_data(self,url,endPage):
- url = url + '&pn='
- for i in range(1,endPage+1):
- print u'Spider report: fetcher %d is loading...' % i
- myPage = urllib2.urlopen(url + str(i)).read()
- # Clean the html in myPage and append the result to datas
- self.deal_data(myPage.decode('gbk'))
-
-
- # Extract the content from the page source
- def deal_data(self,myPage):
- myItems = re.findall('id="post_content.*?>(.*?)</div>',myPage,re.S)
- for item in myItems:
- data = self.myTool.Replace_Char(item.replace("\n","").encode('gbk'))
- self.datas.append(data+'\n')
-
-
-
-#-------- Program entry point ------------------
-print u"""#---------------------------------------
-# Program: Baidu Tieba spider
-# Version: 0.5
-# Author: why
-# Date: 2013-05-16
-# Language: Python 2.7
-# Usage: enter a post URL; the spider switches to original-poster-only view and saves the content locally
-# Function: package the original poster's content into a local txt file.
-#---------------------------------------
-"""
-
-# Using a fiction forum post as an example
-# bdurl = 'http://tieba.baidu.com/p/2296712428?see_lz=1&pn=1'
-
-print u'Please enter the digits at the end of the Tieba post URL:'
-bdurl = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/'))
-
-# Run the spider
-mySpider = Baidu_Spider(bdurl)
-mySpider.baidu_tieba()
\ No newline at end of file
diff --git a/Mathematical-Modeling-2014/Project/cloud_large.png b/Mathematical-Modeling-2014/Project/cloud_large.png
deleted file mode 100644
index f8b17b9..0000000
Binary files a/Mathematical-Modeling-2014/Project/cloud_large.png and /dev/null differ
diff --git a/Mathematical-Modeling-2014/Project/myTest/TTT.txt b/Mathematical-Modeling-2014/Project/myTest/TTT.txt
deleted file mode 100644
index 5fe7392..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/TTT.txt
+++ /dev/null
@@ -1,105 +0,0 @@
- SA( 1, 1) 0.000000
- SA( 1, 2) 4.000000
- SA( 1, 3) 0.000000
- SA( 1, 4) 0.000000
- SA( 1, 5) 0.000000
- SA( 2, 1) 0.000000
- SA( 2, 2) 4.000000
- SA( 2, 3) 0.000000
- SA( 2, 4) 0.000000
- SA( 2, 5) 0.000000
- SA( 3, 1) 4.000000
- SA( 3, 2) 0.000000
- SA( 3, 3) 0.000000
- SA( 3, 4) 0.000000
- SA( 3, 5) 0.000000
- SA( 4, 1) 0.000000
- SA( 4, 2) 0.000000
- SA( 4, 3) 0.000000
- SA( 4, 4) 4.000000
- SA( 4, 5) 0.000000
- SA( 5, 1) 0.000000
- SA( 5, 2) 4.000000
- SA( 5, 3) 0.000000
- SA( 5, 4) 0.000000
- SA( 5, 5) 0.000000
- SA( 6, 1) 0.000000
- SA( 6, 2) 0.000000
- SA( 6, 3) 0.000000
- SA( 6, 4) 3.000000
- SA( 6, 5) 0.000000
- SA( 7, 1) 4.000000
- SA( 7, 2) 0.000000
- SA( 7, 3) 0.000000
- SA( 7, 4) 0.000000
- SA( 7, 5) 0.000000
- SA( 8, 1) 0.000000
- SA( 8, 2) 0.000000
- SA( 8, 3) 0.000000
- SA( 8, 4) 4.000000
- SA( 8, 5) 0.000000
- SA( 9, 1) 0.000000
- SA( 9, 2) 4.000000
- SA( 9, 3) 0.000000
- SA( 9, 4) 0.000000
- SA( 9, 5) 0.000000
- SA( 10, 1) 0.000000
- SA( 10, 2) 4.000000
- SA( 10, 3) 0.000000
- SA( 10, 4) 0.000000
- SA( 10, 5) 0.000000
- SA( 11, 1) 0.000000
- SA( 11, 2) 0.000000
- SA( 11, 3) 0.000000
- SA( 11, 4) 4.000000
- SA( 11, 5) 0.000000
- SA( 12, 1) 0.000000
- SA( 12, 2) 4.000000
- SA( 12, 3) 0.000000
- SA( 12, 4) 0.000000
- SA( 12, 5) 0.000000
- SA( 13, 1) 0.000000
- SA( 13, 2) 0.000000
- SA( 13, 3) 0.000000
- SA( 13, 4) 4.000000
- SA( 13, 5) 0.000000
- SA( 14, 1) 4.000000
- SA( 14, 2) 0.000000
- SA( 14, 3) 0.000000
- SA( 14, 4) 0.000000
- SA( 14, 5) 0.000000
- SA( 15, 1) 0.000000
- SA( 15, 2) 0.000000
- SA( 15, 3) 0.000000
- SA( 15, 4) 4.000000
- SA( 15, 5) 0.000000
- SA( 16, 1) 0.000000
- SA( 16, 2) 0.000000
- SA( 16, 3) 4.000000
- SA( 16, 4) 0.000000
- SA( 16, 5) 0.000000
- SA( 17, 1) 0.000000
- SA( 17, 2) 0.000000
- SA( 17, 3) 4.000000
- SA( 17, 4) 0.000000
- SA( 17, 5) 0.000000
- SA( 18, 1) 0.000000
- SA( 18, 2) 0.000000
- SA( 18, 3) 4.000000
- SA( 18, 4) 0.000000
- SA( 18, 5) 0.000000
- SA( 19, 1) 0.000000
- SA( 19, 2) 0.000000
- SA( 19, 3) 4.000000
- SA( 19, 4) 0.000000
- SA( 19, 5) 0.000000
- SA( 20, 1) 0.000000
- SA( 20, 2) 0.000000
- SA( 20, 3) 4.000000
- SA( 20, 4) 0.000000
- SA( 20, 5) 0.000000
- SA( 21, 1) 0.000000
- SA( 21, 2) 0.000000
- SA( 21, 3) 4.000000
- SA( 21, 4) 0.000000
- SA( 21, 5) 0.000000
\ No newline at end of file
diff --git a/Mathematical-Modeling-2014/Project/myTest/ansj_dict.py b/Mathematical-Modeling-2014/Project/myTest/ansj_dict.py
deleted file mode 100644
index db1e280..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/ansj_dict.py
+++ /dev/null
@@ -1,149 +0,0 @@
-#coding:utf-8
-
-path = "C:\\Users\\Syndrome\\Desktop\\语料数据\\ansj词典\\".decode('utf8').encode('cp936')
-new_path = path + "81W_dict.txt"
-
-################################################# Load the dictionary
-myFile = open(new_path,"r")
-
-word_81 = []
-word_length = []
-
-line = myFile.readline()
-
-i = 0
-while line:
- line = line.rstrip('\n')
- # print line
- word_81.append(line)
- word_length.append(len(line)/3) # len/3: each Chinese character takes 3 bytes in UTF-8
- line = myFile.readline()
- i += 1
-
-max_len = max(word_length)
-print "the num of word is " + str(i)
-print "the max of length is " + str(max_len)
-print "part1"
-
-myFile.close()
-
-################################################# Re-save the dictionary sorted by word length
-
-newPath = path + "ansj_simple.txt"
-
-myFile = open(newPath , 'w')
-
-for i in range(50,-1,-1): # from the longest length down to 0
- for j in range(0,len(word_length)):
- if word_length[j] == i:
- newLine = word_81[j] + "\n"
- myFile.writelines(newLine)
-
-myFile.close()
-
-
-print "part2"
-
-############################################################## Build the word-length index file
-new_word_length = sorted(word_length)
-new_len = [811639]
-
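-# Index over the longest-first dictionary written above: new_word_length is
-# ascending, so 811639-j counts the words longer than i, which is the offset
-# where words of length i begin in the sorted file. The segmenter below uses
-# these offsets to scan only the band of words matching a prefix length.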
-j = 0
-for i in range(0,12):
- while j < len(new_word_length):
- if new_word_length[j] == i:
- pass
- else:
- new_len.append(811639-j)
- break
- j += 1
-new_len.append(811639-j)
-
-newPath = path + "ansj_word_num.txt"
-
-myFile = open(newPath , 'w')
-
-print len(new_len)
-print new_len
-for i in range(0,len(new_len)):
- myFile.writelines(str(new_len[i]) + '\n')
-
-myFile.close()
-
-print "part3"
-
-################################################# Word segmentation (forward maximum matching)
-
-word = []
-
-myFile = open(path + "ansj_simple.txt" , 'r')
-line = myFile.readline().rstrip('\n')
-i = 0
-while line:
- word.append(line)
- line = myFile.readline().rstrip('\n')
-myFile.close()
-print "dictionary is ready!"
-
-word_num = new_len
-print "the position of word is ready!"
-
-
-TEST = "一位朴实美丽的渔家姑娘从红树林边的渔村闯入都市,经历了情感的波折和撞击演绎出复杂而\
-又多变的人生。故事发生在有着大面积红树林的小渔村和南海海滨一座新兴的小城里。渔家姑娘珍珠进\
-城打工,珍珠公司总经理大虎对她一见钟情,珍珠却不为所动。大虎企图强占珍珠,珍珠毅然回到红树\
-林。大虎在另两个干部子弟二虎和三虎的挑唆下,轮奸了珍珠。珍珠的意中人大同进行报复,欲杀大虎\
-的母亲、副市长林岚,却刺伤了检查官马叔。大虎又与二虎、三虎轮奸了女工小云,被当场抓获。林岚\
-救子心切,落入了刑侦科长金大川手里。马叔与牛晋顶住压力,使案件终于重审,三个虎被绳之以法。"
-
-new_sent = []
-T_len = len(TEST)/3
-
-if T_len < 10:
- s = T_len
-else:
- s = 9
-
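-# Forward maximum matching: try the first s characters of TEST as a word
-# (s starts at 9, the longest window tried here); on a dictionary hit, emit
-# the word and restart on the remaining text, otherwise shrink the window.
-# At s == 1 the single character is emitted as a word of its own.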
-while s > 0:
- flag = 0
- # print word_num[s]-1
- # print word_num[s+1]
- for i in range(word_num[s]-2,word_num[s+1]-1,-1):
- # print i
- if TEST[0:s*3] == word[i]:
- new_sent.append(word[i])
- print word[i] + "ZZZZZZZZZ"
- flag = 1
- break
- if flag == 1:
- TEST = TEST[s*3:]
- if len(TEST)/3 < 10:
- s = len(TEST)/3
- else:
- s = 9
- else:
- s -= 1
- if s == 1:
- new_sent.append(TEST[:s*3])
- print "TTTTT" + TEST[:s*3] + " " + str(s)
- TEST = TEST[s*3:]
- if len(TEST)/3 < 10:
- s = len(TEST)/3
- else:
- s = 9
-
-for item in new_sent:
- print item + "\\",
-
-print "\npart4"
-
-
-
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/get_word_length.py b/Mathematical-Modeling-2014/Project/myTest/get_word_length.py
deleted file mode 100644
index 1ff701c..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/get_word_length.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#coding:utf-8
-
-
-############################################################## Read the dictionary file
-# Handle a path containing Chinese characters
-path = "C:\\Users\\Syndrome\\Desktop\\语料数据\\360W_字典\\".decode('utf8').encode('cp936')
-
-myFile = open(path + "dict_360.txt","r")
-
-word_length = []
-word_line = []
-
-line = myFile.readline()
-i = 0
-while line:
- word_line.append(line)
- line = line.rstrip('\n') # strip the trailing newline
- m = line.split('\t') # split on tabs
- #word_length[i] = len(m[0])/3
- word_length.append(len(m[0])/3)
- i += 1
- line = myFile.readline()
- # if i >= 1000:
- # break
-myFile.close()
-
-print "finish"
-print "max of the length of word is " + str(max(word_length))
-print len(word_length)
-print len(word_line)
-
-# Write the output files
-############################################################## Prepend each word's length, sort by length, and save the dictionary
-newPath = path + "dictionary.txt"
-myFile = open(newPath , 'w')
-
-for i in range(50,-1,-1): # from the longest length down to 0
- for j in range(0,len(word_length)):
- if word_length[j] == i:
- newLine = str(i) + '\t' + word_line[j]
- myFile.writelines(newLine)
-
-myFile.close()
-
-
-############################################################## Save a simplified dictionary, still sorted by length
-newPath = path + "dictionary_simple.txt"
-
-myFile = open(newPath , 'w')
-
-for i in range(50,-1,-1): # from the longest length down to 0
- for j in range(0,len(word_length)):
- if word_length[j] == i:
- m = word_line[j].split('\t')
- newLine = m[0] + "\n"
- myFile.writelines(newLine)
-
-myFile.close()
-
-############################################################## Build the word-length index file
-new_word_length = sorted(word_length)
-new_len = [0]
-
-j = 0
-for i in range(0,50):
- while j < len(new_word_length):
- if new_word_length[j] == i:
- pass
- else:
- new_len.append(3669216-j)
- break
- j += 1
-new_len.append(3669216-j)
-
-newPath = path + "word_num.txt"
-
-myFile = open(newPath , 'w')
-
-print len(new_len)
-print new_len
-for i in range(0,len(new_len)):
- myFile.writelines(str(new_len[i]) + '\n')
-
-myFile.close()
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/math1.py b/Mathematical-Modeling-2014/Project/myTest/math1.py
deleted file mode 100644
index 6d008ee..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/math1.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#coding:utf-8
-
-
-myFile = open("TTT.txt")
-
-line = myFile.readline()
-
-print line
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/nltk_test.py b/Mathematical-Modeling-2014/Project/myTest/nltk_test.py
deleted file mode 100644
index a09d31a..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/nltk_test.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#coding:utf-8
-
-word_num = [0,0,1,2,2,3,3,3,3,3,3,3,3,3,4]
-word = ["你是","我今生","唯一的挚爱","你是我今生唯一的挚爱啊啊啊啊"]
-
-TEST = "他说你是我今生唯一的挚爱"
-
-T_len = len(TEST)/3
-print T_len
-s = T_len
-
-while s > 0:
- flag = 0
- print TEST[0:s*3]
- for i in range(word_num[s]-1,word_num[s+1]):
- print word[i]+"sss"
- if TEST[0:s*3] == word[i]:
- print word[i] + "XXXXXX"
- flag = 1
- if flag == 1:
- TEST = TEST[s*3:]
- s = len(TEST)/3
- else:
- s -= 1
- if s == 1:
- print TEST[:s*3] + "ZZZZZZZ"
- TEST = TEST[s*3:]
- s = len(TEST)/3
-
-
-import random
-def guess(player):
- declare = 'You enter number not between 1 and 99!'
- number = int(raw_input('Player %s - Enter a number between 1 and 99:' % player))
- if number < 1:
- print declare
- elif number > 99:
- print declare
- else:
- pass
- return number
-
-def game():
- i = 1
- count = [0,0,0]
- flag = True
- random_num = random.randrange(1,99)
- while flag:
- for player in range(0,3):
- number = guess(player + 1)
- count[player] = i
- if number > random_num:
- print 'Your guess is too high!'
- elif number < random_num:
- print 'Your guess is too low!'
- else:
- print '--------------------------------------'
- print 'You made the right guess!'
- print 'The secret number is %s' % number
- for p in range(0,len(count)):
- print 'Player %s - Total number of guesses: %s' % (p + 1,count[p])
- flag = False
- break
- i = i + 1
-
-game()
-
\ No newline at end of file
diff --git a/Mathematical-Modeling-2014/Project/myTest/pachong_test.py b/Mathematical-Modeling-2014/Project/myTest/pachong_test.py
deleted file mode 100644
index 568428d..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/pachong_test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-
-import urllib2
-url='http://www.baidu.com/s?wd=cloga'
-content=urllib2.urlopen(url).read()
-
-
-import re
-urls_pat=re.compile(r'<span class="g">(.*?)</span>')
-siteUrls=re.findall(urls_pat,content)
-
-print siteUrls
-
-strip_tag_pat = re.compile(r'<.*?>')
-
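-# Each matched span still contains markup: strip the residual tags, split
-# on '&nbsp;' (assumed separator between the shown URL and the date in
-# Baidu's markup; the last field is the date, the rest joined is the site
-# URL), and write one date,url,rank line per result.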
-rank = 0
-file=open('result.txt','w')
-for i in siteUrls:
- i0=re.sub(strip_tag_pat,'',i)
- i0=i0.strip()
- i1=i0.split('&nbsp;')
- date=i1[-1]
- siteUrl=''.join(i1[:-1])
- rank+=1
- file.write(date+','+siteUrl+','+str(rank)+'\n')
-file.close()
-
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/result.txt b/Mathematical-Modeling-2014/Project/myTest/result.txt
deleted file mode 100644
index c9869ec..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/result.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-cloga.info/ 2014-07-26 ,,2
-github.com/cloga 2012-01-10 ,,3
-www.douban.com/people/... 2013-05-12 ,,4
-cn.linkedin.com/in/clo... 2013-01-28 ,,5
-www.weibo.com/cloga 2014-07-31 ,,6
-www.tianya.cn/12911163 2012-01-20 ,,7
-cn.linkedin.com/in/cloga 2011-09-01 ,,8
-space.chinaz.com/Cloga 2014-05-29 ,,9
-i.youku.com/u/UODM5OTU... 2013-01-27 ,,10
diff --git a/Mathematical-Modeling-2014/Project/myTest/split_sentence.py b/Mathematical-Modeling-2014/Project/myTest/split_sentence.py
deleted file mode 100644
index 1750d7d..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/split_sentence.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#coding:utf-8
-
-path = "C:\\Users\\Syndrome\\Desktop\\语料数据\\360W_字典\\".decode('utf8').encode('cp936')
-
-newPath = path + "dictionary_simple.txt"
-
-word = []
-word_num = []
-
-#################################################################### Load the simplified dictionary file into memory
-myFile = open(newPath , 'r')
-
-line = myFile.readline().rstrip('\n')
-i = 0
-while line:
- word.append(line)
- line = myFile.readline().rstrip('\n')
- # if i == 2000:
- # print word[i]
- # i=i+1
-
-myFile.close()
-print len(word)
-print "part1"
-
-#################################################################### Load the word-length index file into memory
-newPath2 = path + "word_num.txt"
-myFile = open(newPath2 , 'r')
-
-line = myFile.readline().rstrip('\n')
-
-while line:
- word_num.append(int(line))
- line = myFile.readline().rstrip('\n')
-
-myFile.close()
-
-print len(word_num)
-
-print "part2"
-#################################################################### Segment the text using the dictionary
-
-TEST = "你是我一生的挚爱啊我的女神"
-
-TEST = "一位朴实美丽的渔家姑娘从红树林边的渔村闯入都市,经历了情感的波折和撞击演绎出复杂而\
-又多变的人生。故事发生在有着大面积红树林的小渔村和南海海滨一座新兴的小城里。渔家姑娘珍珠进\
-城打工,珍珠公司总经理大虎对她一见钟情,珍珠却不为所动。大虎企图强占珍珠,珍珠毅然回到红树\
-林。大虎在另两个干部子弟二虎和三虎的挑唆下,轮奸了珍珠。珍珠的意中人大同进行报复,欲杀大虎\
-的母亲、副市长林岚,却刺伤了检查官马叔。大虎又与二虎、三虎轮奸了女工小云,被当场抓获。林岚\
-救子心切,落入了刑侦科长金大川手里。马叔与牛晋顶住压力,使案件终于重审,三个虎被绳之以法。"
-
-
-new_sent = []
-
-T_len = len(TEST)/3
-
-if T_len < 41:
- s = T_len
-else:
- s = 40
-
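-# Forward maximum matching, as in ansj_dict.py but with a 40-character
-# window: try the longest prefix first, emit it on a dictionary hit, shrink
-# the window on a miss, and fall back to a single character at s == 1.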
-while s > 0:
- flag = 0
- # print word_num[s]-1
- # print word_num[s+1]
- # print s
- # print TEST[0:s*3]
- for i in range(word_num[s]-1,word_num[s+1],-1):
- #print word[i]
- if TEST[0:s*3] == word[i]:
- new_sent.append(word[i])
- print word[i] + "ZZZZZZZZZ"
- flag = 1
- break
- if flag == 1:
- TEST = TEST[s*3:]
- if len(TEST)/3 < 41:
- s = len(TEST)/3
- else:
- s = 40
- else:
- s -= 1
- if s == 1:
- new_sent.append(TEST[:s*3])
- print "TTTTT" + TEST[:s*3] + " " + str(s)
- TEST = TEST[s*3:]
- if len(TEST)/3 < 41:
- s = len(TEST)/3
- else:
- s = 40
-
-
-for item in new_sent:
- print item + "\\",
-
-
-print "\npart3"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/test.py b/Mathematical-Modeling-2014/Project/myTest/test.py
deleted file mode 100644
index 338a497..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#coding:utf-8
-
-import os
-
-path = "C:\\Users\\Syndrome\\Desktop\\语料数据\\文本分类\\20_newsgroups\\".decode("utf-8").encode("cp936")
-
-filenamelist=os.listdir(path)
-for item in filenamelist :
- print item
- filenamelist2 = os.listdir(path + "\\" + item)
- for item2 in filenamelist2 :
- print item2
- newPath = path + "\\" + item +"\\" + item2
- myFile = open (newPath)
-
- myFile.close()
-
-print "finish!"
-
-
-
-
-# myFile = open(path)
-
-# line = myFile.readline()
-
-# while line :
-# print line
-# line = myFile.readline()
-
-# myFile.close()
-
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/test2.py b/Mathematical-Modeling-2014/Project/myTest/test2.py
deleted file mode 100644
index 228eb48..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/test2.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#coding: utf-8
-
-
-### Multithreading
-
-import threading
-TOTAL = 0
-MY_LOCK = threading.Lock()
-class CountThread(threading.Thread):
- def run(self):
- global TOTAL
- for i in range(100):
- MY_LOCK.acquire()
- TOTAL = TOTAL + 1
- MY_LOCK.release()
- print('%s\n' % (TOTAL))
-a = CountThread()
-b = CountThread()
-a.start()
-b.start()
-
-
-
-text1 = ["你是","我今生","唯一的挚爱","你是我今生唯一的挚爱啊啊啊啊"]
-print text1.count("你是")+1
-
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/myTest/test_dict_360.py b/Mathematical-Modeling-2014/Project/myTest/test_dict_360.py
deleted file mode 100644
index 13d6c74..0000000
--- a/Mathematical-Modeling-2014/Project/myTest/test_dict_360.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#coding:utf-8
-
-
-path = "C:\\Users\\Syndrome\\Desktop\\语料数据\\360W_字典\\dict_360.txt".decode('utf8').encode('cp936')
-
-f = open(path,"r")
-
-line = f.readline()
-i = 0
-while line:
- line = line.rstrip('\n') # strip the trailing '\n'
- m = line.split('\t') # split the string on '\t'
-
- print len(m[0])/3
-
- for item in m:
- print item # a trailing ',' would suppress the newline
- # print(line, end = '') # the Python 3 way
-
- line = f.readline()
- i += 1
- if i == 1000:
- break
-
-f.close()
-
-
-
-# Shortcut to comment out code: Ctrl+/
-# def str_len(str):
-# try:
-# row_l=len(str)
-# utf8_l=len(str.encode('utf-8'))
-# return (utf8_l-row_l)/2+row_l
-# except:
-# return None
-# return None
-
diff --git a/Mathematical-Modeling-2014/Project/qiubai_spider.py b/Mathematical-Modeling-2014/Project/qiubai_spider.py
deleted file mode 100644
index b88fb52..0000000
--- a/Mathematical-Modeling-2014/Project/qiubai_spider.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# -*- coding: utf-8 -*-
-#---------------------------------------
-# Program: Qiushibaike spider
-# Version: 0.2
-# Author: why
-# Date: 2013-05-15
-# Language: Python 2.7
-# Usage: type quit to stop reading Qiushibaike
-# Function: press Enter to browse today's hot Qiushibaike posts one by one
-# Update: fixed garbled output in the command prompt
-#---------------------------------------
-
-import urllib2
-import urllib
-import re
-import thread
-import time
-
-# import sys
-# reload(sys)
-# sys.setdefaultencoding('utf-8')
-
-#----------- Handle the various tags on the page -----------
-class HTML_Tool:
- # Non-greedy match for \t, \n, a space, a link or an image
- BgnCharToNoneRex = re.compile("(\t|\n| |<a.*?>|<img.*?>)")
-
- # Non-greedy match for any <> tag
- EndCharToNoneRex = re.compile("<.*?>")
-
- # Non-greedy match for any <p> tag
- BgnPartRex = re.compile("<p.*?>")
- CharToNewLineRex = re.compile("(<br/>|</p>|<tr>|<div>|</div>)")
- CharToNextTabRex = re.compile("<td>")
-
- # Convert common HTML character entities back to their literal characters
- replaceTab = [("&lt;","<"),("&gt;",">"),("&amp;","&"),("&quot;","\""),("&nbsp;"," ")]
-
- def Replace_Char(self,x):
- x = self.BgnCharToNoneRex.sub("",x)
- x = self.BgnPartRex.sub("\n ",x)
- x = self.CharToNewLineRex.sub("\n",x)
- x = self.CharToNextTabRex.sub("\t",x)
- x = self.EndCharToNoneRex.sub("",x)
-
- for t in self.replaceTab:
- x = x.replace(t[0],t[1])
- return x
-#----------- Handle the various tags on the page -----------
-
-
-#----------- Load and process Qiushibaike -----------
-class HTML_Model:
-
- def __init__(self):
- self.page = 1
- self.pages = []
- self.myTool = HTML_Tool()
- self.enable = False
-
- # Extract all the jokes, add them to a list and return the list
- def GetPage(self,page):
- myUrl = "http://m.qiushibaike.com/hot/page/" + page
- myResponse = urllib2.urlopen(myUrl)
- myPage = myResponse.read()
- # encode converts a unicode string to a byte string in some encoding
- # decode converts a byte string in some encoding to a unicode string
- unicodePage = myPage.decode("utf-8")
-
- # Find every div tagged class="content"
- # re.S makes . match any character, including newlines
- myItems = re.findall('<div.*?class="content".*?title="(.*?)">(.*?)</div>',unicodePage,re.S)
- items = []
- for item in myItems:
- # the first group is the div's title, i.e. the timestamp
- # the second group is the div's content, i.e. the joke text
- items.append([item[0].replace("\n",""),item[1].replace("\n","")])
- return items
-
- # Background loader for new jokes
- def LoadPage(self):
- # Keep running until the user types quit
- while self.enable:
- # If fewer than 2 pages are buffered
- if len(self.pages) < 2:
- try:
- # Fetch the jokes from a new page
- myPage = self.GetPage(str(self.page))
- self.page += 1
- self.pages.append(myPage)
- except:
- print 'Failed to connect to Qiushibaike!'
- else:
- time.sleep(1)
-
- def ShowPage(self,q,page):
- for items in q:
- print u'Page %d' % page , items[0]
- print self.myTool.Replace_Char(items[1])
- myInput = raw_input()
- if myInput == "quit":
- self.enable = False
- break
-
- def Start(self):
- self.enable = True
- page = self.page
-
- print u'Loading, please wait......'
-
- # Start a background thread that fetches and buffers jokes
- thread.start_new_thread(self.LoadPage,())
-
- #----------- Load and process Qiushibaike -----------
- while self.enable:
- # If the pages buffer holds any elements
- if self.pages:
- nowPage = self.pages[0]
- del self.pages[0]
- self.ShowPage(nowPage,page)
- page += 1
-
-
-#----------- Program entry point -----------
-print u"""
----------------------------------------
- Program: Qiushibaike spider
- Version: 0.1
- Author: why
- Date: 2013-05-15
- Language: Python 2.7
- Usage: type quit to stop reading Qiushibaike
- Function: press Enter to browse today's hot Qiushibaike posts one by one
----------------------------------------
-"""
-
-
-print u'Press Enter to browse today\'s hot Qiushibaike posts:'
-raw_input(' ')
-myModel = HTML_Model()
-myModel.Start()
\ No newline at end of file
diff --git a/Mathematical-Modeling-2014/Project/snownlp_test.py b/Mathematical-Modeling-2014/Project/snownlp_test.py
deleted file mode 100644
index d4eb9f6..0000000
--- a/Mathematical-Modeling-2014/Project/snownlp_test.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#coding:utf-8
-# import sys
-# reload(sys)
-# sys.setdefaultencoding( "utf-8" )
-
-
-from snownlp import SnowNLP
-
-str1 = u'这个东西真心很赞'
-s = SnowNLP(str1)
-
-#print str1
-print str1.encode('utf-8')
-
-sw=s.words
-print sw
-#print sw.encode('utf-8') # [u'这个', u'东西', u'真心',
- # u'很', u'赞']
-
-print s.tags # [(u'这个', u'r'), (u'东西', u'n'),
- # (u'真心', u'd'), (u'很', u'd'),
- # (u'赞', u'Vg')]
-
-print s.sentiments # 0.9830157237610916, the probability of a positive sentiment
-
-print s.pinyin # [u'zhe', u'ge', u'dong', u'xi',
- # u'zhen', u'xin', u'hen', u'zan']
-
-
-s = SnowNLP(u'「繁體字」「繁體中文」的叫法在臺灣亦很常見。')
-
-s.han # u'「繁体字」「繁体中文」的叫法
- # 在台湾亦很常见。'
-
-text = u'''
-自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。
-它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。
-自然语言处理是一门融语言学、计算机科学、数学于一体的科学。
-因此,这一领域的研究将涉及自然语言,即人们日常使用的语言,
-所以它与语言学的研究有着密切的联系,但又有重要的区别。
-自然语言处理并不是一般地研究自然语言,
-而在于研制能有效地实现自然语言通信的计算机系统,
-特别是其中的软件系统。因而它是计算机科学的一部分。
-'''
-
-s = SnowNLP(text)
-
-s.keywords(3) # [u'语言', u'自然', u'计算机']
-
-s.summary(3) # [u'自然语言处理是一门融语言学、计算机科学、
- # 数学于一体的科学',
- # u'即人们日常使用的语言',
- # u'自然语言处理是计算机科学领域与人工智能
- # 领域中的一个重要方向']
-s.sentences
-
-s = SnowNLP([[u'这篇', u'文章'],
- [u'那篇', u'论文'],
- [u'这个']])
-s.tf
-s.idf
-s.sim([u'文章'])# [0.3756070762985226, 0, 0]
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/spider.py b/Mathematical-Modeling-2014/Project/spider.py
deleted file mode 100644
index 2854f04..0000000
--- a/Mathematical-Modeling-2014/Project/spider.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding=utf-8
-
-#---------------------------------------
-# Program: Baidu Tieba spider
-# Version: 0.1
-# Author: why
-# Date: 2013-05-14
-# Language: Python 2.7
-# Usage: enter a paginated URL with the trailing page number removed, then set the start and end pages.
-# Function: download every page in that range and save each one to a local file.
-#---------------------------------------
-
-import string, urllib
-
-# The Baidu download function
-def baidu_tieba(url,begin_page,end_page):
- for i in range(begin_page, end_page+1):
- sName = string.zfill(i,5) + '.txt' # zero-pad the page number to five digits for the filename
- print 'Downloading page ' + str(i) + ' and saving it as ' + sName + '......'
- f = open(sName,'w+')
- m = urllib.urlopen(url + str(i)).read()
-
- #print m
-
- f.write(m)
- f.close()
-
-
-#-------- Set the parameters here ------------------
-
-# This is the URL of a post in the Shandong University Baidu Tieba
-#bdurl = 'http://tieba.baidu.com/p/2296017831?pn='
-#iPostBegin = 1
-#iPostEnd = 10
-
-#bdurl = str(raw_input(u'Enter the Tieba URL without the digits after pn=:\n'))
-bdurl = 'http://tieba.baidu.com/p/2296017831?pn='
-#begin_page = int(raw_input(u'Enter the start page:\n'))
-#end_page = int(raw_input(u'Enter the end page:\n'))
-begin_page = 1
-end_page = 5
-#-------- Set the parameters here ------------------
-
-
-# Run it
-baidu_tieba(bdurl,begin_page,end_page)
-
-response = urllib.urlopen('http://www.baidu.com/')
-html = response.read()
-print html
diff --git a/Mathematical-Modeling-2014/Project/test1.py b/Mathematical-Modeling-2014/Project/test1.py
deleted file mode 100644
index 2d2f883..0000000
--- a/Mathematical-Modeling-2014/Project/test1.py
+++ /dev/null
@@ -1,28 +0,0 @@
-
-import re
-import urllib
-
-
-def getHtml(url):
- page = urllib.urlopen(url)
- html = page.read()
- return html
-
-def getImg(html):
- reg = r"src='+(.*?\.jpg)+' width"
- imgre = re.compile(reg)
- imgList = re.findall(imgre,html)
- x = 0
- for imgurl in imgList:
- print imgurl
- #urllib.urlretrieve(imgurl,'%s.jpg' % x)
- x+=1
-
-
-#a = raw_input()
-
-html = getHtml("http://tieba.baidu.com/p/2844418574?pn=2")
-getImg(html)
-
-
-
diff --git a/Mathematical-Modeling-2014/Project/test_test.py b/Mathematical-Modeling-2014/Project/test_test.py
deleted file mode 100644
index 13d6f2c..0000000
--- a/Mathematical-Modeling-2014/Project/test_test.py
+++ /dev/null
@@ -1,6 +0,0 @@
-#coding:utf-8
-s=u"中文"
-b=u"我"
-print b.encode("gb2312")
-print s.encode("gb2312")
-
diff --git a/Mathematical-Modeling-2014/Project/wordcloud.py b/Mathematical-Modeling-2014/Project/wordcloud.py
deleted file mode 100644
index f06fd07..0000000
--- a/Mathematical-Modeling-2014/Project/wordcloud.py
+++ /dev/null
@@ -1,12 +0,0 @@
-#test of pytagcloud
-
-from pytagcloud import create_tag_image, make_tags
-from pytagcloud.lang.counter import get_tag_counts
-
-YOUR_TEXT = "A tag cloud is a visual representation for text data, typically\
-used to depict keyword metadata on websites, or to visualize free form text."
-
-tags = make_tags(get_tag_counts(YOUR_TEXT), maxsize=120)
-
-create_tag_image(tags, 'cloud_large.png', size=(900, 600), fontname='Lobster')
-
diff --git a/Mathematical-Modeling-2014/car.txt b/Mathematical-Modeling-2014/car.txt
deleted file mode 100644
index bb9fd8a..0000000
--- a/Mathematical-Modeling-2014/car.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-4490 1780
-4466 1705
-4531 1817
-4670 1780
-4747 1820
-4500 1755
-4880 1800
-4865 1805
-4687 1700
-4544 1760
-4608 1743
-4350 1735
-4400 1695
-4789 1765
-5015 1880
-4600 1800
-4930 1795
-4945 1845
-4603 1780
-4855 1780
-5035 1855
-4480 1840
-4580 1725
-4420 1690
-6831 1980
-3745 1615
-4194 1680
-3763 1615
-3460 1618
-4310 1695
-4270 1695
-4245 1680
-4212 1762
-3588 1563
-3998 1640
-4230 1690
-4135 1755
diff --git a/Mathematical-Modeling-2014/car45.txt b/Mathematical-Modeling-2014/car45.txt
deleted file mode 100644
index 8b77d39..0000000
--- a/Mathematical-Modeling-2014/car45.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-4610 1826 1763 4 2 0 3 1
-5015 1880 1475 2 3 0 4 2
-4310 1695 1480 12 6 5 10 7
-4747 1820 1440 15 8 4 9 6
-3460 1618 1465 12 8 7 21 6
-4490 1780 1405 10 12 14 9 13
-4230 1690 1550 7 0 2 5 7
-4270 1695 1480 5 3 12 5 4
-4480 1840 1500 4 0 6 8 5
-4135 1755 1605 6 0 0 3 2
-4600 1800 1475 12 3 5 0 0
-4574 1704 1845 6 4 2 0 0
-4500 1755 1450 15 9 5 7 6
-4420 1690 1590 7 4 3 4 5
-4930 1795 1475 4 2 3 1 2
-4350 1735 1470 8 9 4 2 5
-4945 1695 1970 3 0 0 0 2
-4400 1695 1470 13 7 4 8 5
-4945 1845 1480 4 3 4 1 2
-3588 1563 1533 3 5 15 5 8
-4466 1705 1410 4 5 7 2 0
-4531 1817 1421 4 2 0 4 3
-4880 1800 1450 5 3 2 6 5
-5160 1895 1930 7 2 4 3 2
-4800 1770 1880 4 3 8 2 6
-4590 1766 1767 0 1 5 7 8
-4194 1680 1440 3 4 2 8 7
-4865 1805 1450 12 8 4 2 6
-3763 1615 1440 3 5 14 4 7
-3998 1640 1535 0 3 8 6 9
-4285 1765 1715 0 6 4 12 8
-4608 1743 1465 15 12 4 6 5
-4789 1765 1470 10 8 6 7 0
-4687 1700 1450 0 2 12 6 5
-4580 1725 1500 9 4 3 7 5
-4603 1780 1480 5 6 8 0 9
-3820 1495 1860 0 4 20 8 5
-4212 1762 1531 8 7 10 3 5
-4245 1680 1500 5 7 8 4 9
-3745 1615 1385 0 0 15 8 4
-4855 1780 1480 9 5 0 5 6
-4544 1760 1464 8 7 4 5 5
-5035 1855 1485 12 6 0 4 3
-6831 1980 1478 2 0 0 1 1
-4670 1780 1435 15 13 9 10 6
diff --git a/Mathematical-Modeling-2014/test.py b/Mathematical-Modeling-2014/test.py
deleted file mode 100644
index aea4327..0000000
--- a/Mathematical-Modeling-2014/test.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-"""
-Function:
-【教程】把Sublime Text 2用作Python的IDE去实现Python的开发
-
-http://www.crifan.com/use_sublime_text_2_as_python_ide
-
-Author: Crifan Li
-Version: 2013-02-01
-Contact: admin at crifan dot com
-"""
-
-def sublimeText2IdeDemo():
- """
- Demo how to use sublime text 2 as Python IDE
- also try to support:
- input parameter
- autocomplete
- """
- print "Demo print in Sublime Text 2"
- inputVal = 100
- #raw_input("Now in sublime text 2, please input parameter:")
- print "Your inputed parameter is ",inputVal
-
-if __name__ == "__main__":
- sublimeText2IdeDemo()
-
-
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/test2.py b/Mathematical-Modeling-2014/test2.py
deleted file mode 100644
index ccd1f65..0000000
--- a/Mathematical-Modeling-2014/test2.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/python
-#coding=utf-8
-# Mathematical modeling: loading plan for a single car-carrier truck, problems 1-4
-# Input: car-carrier length and width
-# Output: the loading plan
-
-# Passenger cars taller than 1.7 m may only be loaded on the lower deck of type 1-1 and 1-2 carriers
-# Longitudinal and lateral safety gaps must each be at least 0.1 m
-
-Put = [0,1,0,1,0,0]
-# length, upper width, lower width
-Truck = [19.1,24.4,19.1]
-# length, width, height
-Car = [4.71,3.715,4.73]
-
-
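-# For each of the 6 deck sections: when Put[i] is 0, enumerate j cars of
-# length Car[0] plus k of Car[1]; the first (j,k) overflowing deck length
-# Truck[i/2] means k-1 was the largest count that still fit. When Put[i]
-# is 1, a third length Car[2] (count l) is enumerated the same way.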
-for i in range(0,6):
- if Put[i] == 0:
- for j in range(0,int(Truck[i/2]/Car[0])+2):
- for k in range(0,int(Truck[i/2]/Car[1])+2):
- if j*Car[0]+k*Car[1] > Truck[i/2]:
- if k > 0 :
- print(i,j,k-1)
- break
- else:
- for j in range(0,int(Truck[i/2]/Car[0])+2):
- for k in range(0,int(Truck[i/2]/Car[1])+2):
- for l in range(0,int(Truck[i/2]/Car[2])+2):
- if j*Car[0]+k*Car[1]+l*Car[2] > Truck[i/2]:
- if l > 0 :
- print(i,j,k,l-1)
- break
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/test3.py b/Mathematical-Modeling-2014/test3.py
deleted file mode 100644
index 71bd68c..0000000
--- a/Mathematical-Modeling-2014/test3.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/python
-#coding=utf-8
-
-import time
-import numpy as np
-import pylab as pl
-from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import euclidean_distances
-from sklearn.datasets.samples_generator import make_blobs
-
-np.random.seed(0)
-centers = [[1,1], [-1,-1], [1, -1]]
-k = len(centers)
-x , labels = make_blobs(n_samples=3000, centers=centers, cluster_std=.7)
-
-kmeans = KMeans(init='k-means++', n_clusters=3, n_init = 10)
-t0 = time.time()
-kmeans.fit(x)
-t_end = time.time() - t0
-
-colors = ['r', 'b', 'g']
-for k , col in zip( range(k) , colors):
- members = (kmeans.labels_ == k )
- pl.plot( x[members, 0] , x[members,1] , 'w', markerfacecolor=col, marker='.')
- pl.plot(kmeans.cluster_centers_[k,0], kmeans.cluster_centers_[k,1], 'o', markerfacecolor=col,\
- markeredgecolor='k', markersize=10)
-pl.show()
-
-
-
-
-
-
-
-
diff --git a/Mathematical-Modeling-2014/test4.py b/Mathematical-Modeling-2014/test4.py
deleted file mode 100644
index 47a58c3..0000000
--- a/Mathematical-Modeling-2014/test4.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/python
-#coding=utf-8
-
-import string
-
-datafile = open("car.txt")
-
-
-n = 37
-m = 2
-mat = [[0]*m for i in range(n)]
-
-i = 0
-car = datafile.readline()
-while car:
- car_data = car.strip('\n').split(" ")
- j = 0
- for items in car_data:
- # convert the string to an integer
- data1 = string.atoi(items)
- mat[i][j] = data1
- j = j + 1
- #print data1
- i = i + 1
- car = datafile.readline()
-
-
-
-
-for i in range(n):
- for j in range(m):
- print mat[i][j],
- print
-
-
-
-from sklearn.cluster import KMeans
-
-kmeans = KMeans(init='k-means++', n_clusters = 4, n_init = 10)
-
-kmeans.fit(mat)
-
-result = kmeans.predict(mat)
-
-print result
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/README.md b/README.md
index ef79281..7df7f62 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,7 @@ This is a `Chinese tutorial` which is translated from [DeepLearning 0.1 document
这是一个翻译自[深度学习0.1文档](http://deeplearning.net/tutorial/contents.html)的`中文教程`。在这个教程里面所有的算法和模型都是通过Python和[Theano](http://deeplearning.net/software/theano/index.html)实现的。Theano是一个著名的第三方库,允许程序员使用GPU或者CPU去运行他的Python代码。
-
-##内容/Contents
+## 内容/Contents
* [入门(Getting Started)](https://github.com/Syndrome777/DeepLearningTutorial/blob/master/1_Getting_Started_入门.md)
* [使用逻辑回归进行MNIST分类(Classifying MNIST digits using Logistic Regression)](https://github.com/Syndrome777/DeepLearningTutorial/blob/master/2_Classifying_MNIST_using_LR_逻辑回归进行MNIST分类.md)
@@ -27,10 +26,10 @@ This is a `Chinese tutorial` which is translated from [DeepLearning 0.1 document
* Miscellaneous
-##版权/Copyright
-####作者/Author
+## 版权/Copyright
+#### 作者/Author
[Theano Development Team](http://deeplearning.net/tutorial/LICENSE.html), LISA lab, University of Montreal
-####翻译者/Translator
+#### 翻译者/Translator
[Lifeng Hua](https://github.com/Syndrome777), Zhejiang University
diff --git a/images/.DS_Store b/images/.DS_Store
new file mode 100644
index 0000000..bf22167
Binary files /dev/null and b/images/.DS_Store differ