机器学习笔记!
# Python basics: file unzipping and traversal
import zipfile
import os


def unzip_data(src_path, target_path):
    """Extract the zip archive at *src_path* into *target_path*.

    Skips extraction when *target_path* already exists as a directory,
    so repeated runs do not re-extract the same archive.

    :param src_path: path to the .zip file to read
    :param target_path: directory the archive is extracted into
    """
    if not os.path.isdir(target_path):
        # Context manager guarantees the archive is closed even if
        # extractall() raises.
        with zipfile.ZipFile(src_path, 'r') as z:
            z.extractall(path=target_path)


if __name__ == '__main__':
    # Guarded so importing this module does not trigger extraction.
    unzip_data('data/data10954/cat_12_test.zip', 'data/data10954/cat_12_test')
    unzip_data('data/data10954/cat_12_train.zip', 'data/data10954/cat_12_train')
files = os.listdir(path)
返回指定路径下的文件和文件夹列表
os.path.join(path, filename)
把目录和文件名合成一个路径print( os.path.join('root','test','runoob.txt') )
输出:root/test/runoob.txt
参考:https://www.runoob.com/python3/python3-os-path.html
os.path.splitext(temp_path)[1]
分割路径中的文件名与扩展名
dict.setdefault(key, default=None)
如果键不存在于字典中,将会添加键并将其值设为 default(默认为 None),并返回该值;如果键已存在,则直接返回其对应的值
在python中 % 操作符可以实现字符串格式化
import os

# Given a directory, tally every distinct file extension found in its
# tree together with the total bytes occupied by files of that type.

# extension -> cumulative size in bytes ("None" = no extension)
size_dict = {}
# extension -> file count ("None" = no extension)
type_dict = {}


def get_size_type(path):
    """Recursively scan *path*, updating the module-level ``size_dict``
    and ``type_dict`` in place.

    :param path: directory to scan
    """
    for filename in os.listdir(path):
        temp_path = os.path.join(path, filename)
        if os.path.isdir(temp_path):
            # Depth-first recursion into subdirectories.
            get_size_type(temp_path)
        elif os.path.isfile(temp_path):
            # splitext()[1] is '' for extensionless files; bucket them
            # under the literal key "None" as the original report did.
            type_name = os.path.splitext(temp_path)[1] or "None"
            type_dict[type_name] = type_dict.get(type_name, 0) + 1
            size_dict[type_name] = size_dict.get(type_name, 0) + os.path.getsize(temp_path)


if __name__ == '__main__':
    # Guarded so importing this module does not scan a hard-coded path.
    path = "data/"
    get_size_type(path)
    for each_type in type_dict:
        print("%5s下共有【%5s】的文件【%5d】个,占用内存【%7.2f】MB" %
              (path, each_type, type_dict[each_type],
               size_dict[each_type] / (1024 * 1024)))
    print("总文件数: 【%d】" % (sum(type_dict.values())))
    print("总内存大小:【%.2f】GB" % (sum(size_dict.values()) / (1024 ** 3)))
简单计算器的实现 图像分布直方图 文本词频分析
# Text word-frequency analysis: segment a novel with jieba, count word
# frequencies (ignoring stop words and single-character tokens), print
# the top 10 words, and plot their frequencies.
import jieba
from matplotlib import pyplot as plt

with open('data/data131368/test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# Stop-word list, one word per line; `with` closes the file
# deterministically (the original leaked this handle).
with open('data/data131368/stop.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = [line.strip() for line in stopFile.readlines()]

novelList = list(jieba.lcut(novel))
novelDict = {}

# Count every token that is not a stop word and is longer than one
# character (single characters carry little meaning here).
for word in novelList:
    if word not in stopwords and len(word) > 1:
        novelDict[word] = novelDict.get(word, 0) + 1

# Sort (word, count) pairs by count, descending.
novelListSorted = sorted(novelDict.items(), key=lambda e: e[1], reverse=True)

for topWordTup in novelListSorted[:10]:
    print(topWordTup)

x = [c for c, v in novelListSorted]
y = [v for c, v in novelListSorted]
plt.plot(x[:10], y[:10], color='r')
plt.show()
# Data crawling and analysis
import os
import urllib.parse  # `import urllib` alone does not guarantee the parse submodule

import requests


class GetImage():
    """Crawl Baidu Image search results for a keyword and save the
    thumbnails into a local folder named after the keyword.

    :param keyword: search term (default '大雁')
    :param paginator: number of result pages to fetch, 30 images per page
    """

    def __init__(self, keyword='大雁', paginator=1):
        self.url = 'http://image.baidu.com/search/acjson?'
        # Desktop Chrome UA so the JSON API answers like a browser.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/69.0.3497.81 Safari/537.36'
        }
        # Image downloads additionally need a Referer from image.baidu.com,
        # otherwise the CDN rejects the request.
        self.headers_image = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/69.0.3497.81 Safari/537.36',
            'Referer': 'http://image.baidu.com/search/index?tn=baiduimage'
                       '&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr='
                       '&sf=1&fmq=1557124645631_R&pv=&ic=&nc=1&z=&hd=1'
                       '&latest=0&copyright=0&se=1&showtab=0&fb=0&width='
                       '&height=&face=0&istype=2&ie=utf-8&sid=&word='
                       '%E8%83%A1%E6%AD%8C'
        }
        self.keyword = keyword
        self.paginator = paginator

    def get_param(self):
        """Build the query string for each requested result page.

        :return: list of query strings, one per page (rn=30 images each)
        """
        keyword = urllib.parse.quote(self.keyword)
        params = []
        for i in range(1, self.paginator + 1):
            # pn is the absolute offset of the page: 30, 60, 90, ...
            params.append(
                'tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result'
                '&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1'
                '&z=&ic=&hd=1&latest=0&copyright=0&word={}&s=&se=&tab='
                '&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode='
                '&force=&cg=star&pn={}&rn=30&gsm=78&1557125391211='
                .format(keyword, keyword, 30 * i))
        return params

    def get_urls(self, params):
        """Prepend the API base URL to every page's query string."""
        return [self.url + param for param in params]

    def get_image_url(self, urls):
        """Fetch each results page and collect the thumbnail URLs.

        :param urls: list of full API URLs from :meth:`get_urls`
        :return: list of ``thumbURL`` strings found in the responses
        """
        image_url = []
        for url in urls:
            json_data = requests.get(url, headers=self.headers).json()
            json_data = json_data.get('data')
            for item in json_data:
                # The API pads 'data' with an empty trailing object.
                if item:
                    image_url.append(item.get('thumbURL'))
        return image_url

    def get_image(self, image_url):
        """Download every thumbnail into ``./<keyword>/`` as ``<n>_0.jpg``.

        :param image_url: thumbnail URLs from :meth:`get_image_url`
        """
        cwd = os.getcwd()
        file_name = os.path.join(cwd, self.keyword)
        if not os.path.exists(self.keyword):
            os.mkdir(file_name)
        for index, url in enumerate(image_url, start=1):
            # os.path.join instead of manual '/' concatenation.
            with open(os.path.join(file_name, '{}_0.jpg'.format(index)), 'wb') as f:
                f.write(requests.get(url, headers=self.headers_image).content)
            # enumerate starts at 1, so the original `index != 0` guard was
            # redundant; `//` keeps the page number an int (was `index/30`,
            # which printed e.g. 1.0).
            if index % 30 == 0:
                print('第{}页下载完成'.format(index // 30))

    def __call__(self, *args, **kwargs):
        params = self.get_param()
        urls = self.get_urls(params)
        image_url = self.get_image_url(urls)
        self.get_image(image_url)


if __name__ == '__main__':
    spider = GetImage('二次元', 3)
    spider()
基于线性回归实现房价预测 基于逻辑回归模型实现手写数字识别