机器学习笔记!
# python基础实践 文件解压与遍历
# (Section: Python basics practice - unzipping archives and walking directories)
import zipfile
import os


def unzip_data(src_path, target_path):
    """Extract the zip archive at ``src_path`` into ``target_path``.

    Nothing is done when ``target_path`` already exists as a directory, so
    running the snippet again does not extract the same data a second time.

    :param src_path: path of the ``.zip`` file to read.
    :param target_path: directory that receives the archive contents.
    """
    if os.path.isdir(target_path):
        return
    # The context manager closes the archive even when extraction fails
    # part-way, which the manual open()/close() pair did not guarantee.
    with zipfile.ZipFile(src_path, 'r') as z:
        z.extractall(path=target_path)


unzip_data('data/data10954/cat_12_test.zip', 'data/data10954/cat_12_test')
unzip_data('data/data10954/cat_12_train.zip', 'data/data10954/cat_12_train')
files = os.listdir(path)
 
os.path.join(path, filename)
print( os.path.join('root','test','runoob.txt') )
 
参考:https://www.runoob.com/python3/python3-os-path.html 
os.path.splitext(temp_path)[1]
 
dict.setdefault(key, default=None):如果键不存在于字典中,将会添加键并将值设为 default(默认为 None),然后返回该键对应的值。
在python中 % 操作符可以实现字符串格式化
import os

# Given a directory, tally every distinct file type found beneath it together
# with the space those files occupy.
size_dict = {}  # extension -> accumulated size in bytes
type_dict = {}  # extension -> number of files


def get_size_type(path):
    """Record per-extension file counts and byte totals for ``path``.

    Sub-directories are visited recursively. Results accumulate in the
    module-level ``type_dict`` and ``size_dict``; files without an extension
    are filed under the key ``"None"``.

    :param path: directory to scan.
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            get_size_type(full_path)
            continue
        if not os.path.isfile(full_path):
            # Neither a directory nor a regular file: ignored.
            continue
        # splitext yields '' when there is no extension; map that to "None".
        suffix = os.path.splitext(full_path)[1] or "None"
        type_dict[suffix] = type_dict.get(suffix, 0) + 1
        size_dict.setdefault(suffix, 0)
        size_dict[suffix] += os.path.getsize(full_path)


path = "data/"
get_size_type(path)

# One line per extension, sizes converted from bytes to MB.
for each_type in type_dict:
    print("%5s下共有【%5s】的文件【%5d】个,占用内存【%7.2f】MB" % (
        path,
        each_type,
        type_dict[each_type],
        size_dict[each_type] / (1024 * 1024),
    ))

# Grand totals: number of files, and size converted from bytes to GB.
print("总文件数:  【%d】" % (sum(type_dict.values())))
print("总内存大小:【%.2f】GB" % (sum(size_dict.values()) / (1024 ** 3)))
# 简单计算器的实现 图像发布直方图 文本词频分析
# (Sections: simple calculator / image histogram / text word-frequency analysis)
import jieba
from matplotlib import pyplot as plt

# Read the whole text that is going to be analysed.
with open('data/data131368/test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# Load the stop words, one per line. The file is opened in a `with` block so
# the handle is closed instead of being left to the garbage collector.
with open('data/data131368/stop.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = [line.strip() for line in stopFile]
# Set copy for O(1) membership tests inside the counting loop.
stopwordSet = set(stopwords)

# Segment the text into words.
novelList = jieba.lcut(novel)

# Count every word that is neither a stop word nor a single character.
novelDict = {}
for word in novelList:
    if word in stopwordSet or len(word) == 1:
        continue
    novelDict[word] = novelDict.get(word, 0) + 1

# Most frequent words first.
novelListSorted = sorted(novelDict.items(), key=lambda e: e[1], reverse=True)

topWordNum = 0  # not used below; kept so the module namespace is unchanged
for topWordTup in novelListSorted[:10]:
    print(topWordTup)

# Plot the counts of the ten most frequent words.
x = [c for c, v in novelListSorted]
y = [v for c, v in novelListSorted]
plt.plot(x[:10], y[:10], color='r')
plt.show()
# 数据爬取与分析
# (Section: data crawling and analysis)
import os
import urllib.parse

import requests


class GetImage():
    """Download Baidu image-search thumbnails for a keyword.

    Calling an instance runs the whole pipeline: build the query strings,
    turn them into request URLs, collect the thumbnail URLs from the JSON
    responses and save every image into a folder named after the keyword.
    """

    def __init__(self, keyword='大雁', paginator=1):
        """Store the search settings and the HTTP headers used for requests.

        :param keyword: search term; also used as the download folder name.
        :param paginator: number of result pages to request (30 items each).
        """
        self.url = 'http://image.baidu.com/search/acjson?'
        # Long values are written as adjacent string literals. A backslash
        # continuation *inside* the quotes would copy the next line's
        # indentation into the header value / URL.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/69.0.3497.81 Safari/537.36'
            ),
        }
        self.headers_image = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/69.0.3497.81 Safari/537.36'
            ),
            'Referer': (
                'http://image.baidu.com/search/index?'
                'tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&'
                'fm=result&fr=&sf=1&fmq=1557124645631_R&'
                'pv=&ic=&nc=1&z=&hd=1&latest=0&copyright=0&'
                'se=1&showtab=0&fb=0&width=&height=&'
                'face=0&istype=2&ie=utf-8&sid=&'
                'word=%E8%83%A1%E6%AD%8C'
            ),
        }
        self.keyword = keyword
        self.paginator = paginator

    def get_param(self):
        """Return one query string per requested page.

        The keyword is percent-encoded and inserted as both ``queryWord`` and
        ``word``; ``pn`` is set to ``30 * page``.

        :return: list of query strings, ``self.paginator`` entries long.
        """
        keyword = urllib.parse.quote(self.keyword)
        template = (
            'tn=resultjson_com&ipn=rj&ct=201326592&is=&'
            'fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&'
            'oe=utf-8&adpicid=&st=-1&z=&ic=&hd=1&latest=0&'
            'copyright=0&word={}&s=&se=&tab=&width=&height='
            '&face=0&istype=2&qc=&nc=1&fr=&expermode=&'
            'force=&cg=star&pn={}&rn=30&gsm=78&1557125391211='
        )
        # NOTE(review): pages are numbered from 1, so pn starts at 30 and the
        # results at offset 0 are never requested - confirm this is intended.
        return [
            template.format(keyword, keyword, 30 * i)
            for i in range(1, self.paginator + 1)
        ]

    def get_urls(self, params):
        """Prefix every query string with the search endpoint.

        :param params: query strings as produced by :meth:`get_param`.
        :return: list of complete request URLs.
        """
        return [self.url + param for param in params]

    def get_image_url(self, urls):
        """Fetch each result page and collect the thumbnail URLs it lists.

        :param urls: request URLs as produced by :meth:`get_urls`.
        :return: list of thumbnail URLs; entries without one are skipped.
        """
        image_url = []
        for url in urls:
            json_data = requests.get(url, headers=self.headers).json()
            # A response without a 'data' list contributes nothing instead
            # of raising on iteration over None.
            for item in json_data.get('data') or []:
                if not item:
                    continue
                thumb = item.get('thumbURL')
                # Skip entries without a thumbnail; a None here would make
                # the later download request fail.
                if thumb:
                    image_url.append(thumb)
        return image_url

    def get_image(self, image_url):
        """Save every image into a folder named after the search keyword.

        The folder is created inside the current working directory when it
        does not exist yet; files are named ``<index>_0.jpg`` starting at 1.

        :param image_url: iterable of image URLs to download.
        :return: None
        """
        file_name = os.path.join(os.getcwd(), self.keyword)
        os.makedirs(file_name, exist_ok=True)
        for index, url in enumerate(image_url, start=1):
            # Download first so a failed request does not leave an empty file.
            content = requests.get(url, headers=self.headers_image).content
            target = os.path.join(file_name, '{}_0.jpg'.format(index))
            with open(target, 'wb') as f:
                f.write(content)
            if index % 30 == 0:
                # Integer division: report page "1", not "1.0".
                print('第{}页下载完成'.format(index // 30))

    def __call__(self, *args, **kwargs):
        """Run the full pipeline: params -> URLs -> image URLs -> download."""
        params = self.get_param()
        urls = self.get_urls(params)
        image_url = self.get_image_url(urls)
        self.get_image(image_url)


if __name__ == '__main__':
    spider = GetImage('二次元', 3)
    spider()
基于线性回归实现房价预测 基于逻辑回归模型实现手写数字识别