
[Python] One-Click Scraper for the TTB Gallery Site

Author: zhouxinhuagg · Published: 2019-12-26 · 3.27K views

I'm new to Python 3. I came across a gallery site and wrote a quick script that automatically crawls its photo sets and stores them in a MySQL database.

Preparation:

1. Create three MySQL tables (a table-creation sketch follows the three column listings below):

- one to store the gallery collections

- one to store the albums under each collection

- one to store the photos under each album

Table 1: ttb_atlas (collections)

ID         int      primary key, auto-increment
AtlasName  varchar  collection name
AtlasUrl   varchar  collection URL

Table 2: ttb_album (albums)

ID            int      primary key, auto-increment
AtlasName     varchar  collection name
AlbumName     varchar  album name
AlbumUrl      varchar  album URL
AlbumPicUrl   varchar  album cover image URL
AlbumPicDate  date     album date

Table 3: ttb_photo (photos)

ID         int      primary key, auto-increment
AlbumName  varchar  album name
PhotoName  varchar  photo name
PhotoUrl   varchar  photo URL
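
The post lists only column names and types, so here is a minimal table-creation sketch. The varchar lengths are assumptions, and the connection parameters simply mirror the defaults in mysql_db.py below (localhost, root/root, database 'lk'); adjust both to your own setup.

# create_tables.py - hypothetical setup helper, not part of the original post.
import mysql.connector

DDL = [
    """create table if not exists ttb_atlas (
           ID int auto_increment primary key,
           AtlasName varchar(255),
           AtlasUrl varchar(255)
       ) default charset=utf8""",
    """create table if not exists ttb_album (
           ID int auto_increment primary key,
           AtlasName varchar(255),
           AlbumName varchar(255),
           AlbumUrl varchar(255),
           AlbumPicUrl varchar(255),
           AlbumPicDate date
       ) default charset=utf8""",
    """create table if not exists ttb_photo (
           ID int auto_increment primary key,
           AlbumName varchar(255),
           PhotoName varchar(255),
           PhotoUrl varchar(255)
       ) default charset=utf8""",
]

conn = mysql.connector.connect(host='localhost', user='root',
                               passwd='root', database='lk', charset='utf8')
cursor = conn.cursor()
for stmt in DDL:
    cursor.execute(stmt)   # create each table if it does not exist yet
conn.commit()
cursor.close()
conn.close()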

 

2. Create the main program, ttb_lk.py:

#!/usr/local/Cellar/python/3.7.1/bin
# -*- coding: UTF-8 -*-
import sys
sys.path.append("/Python")
import conf.ttb_manage as myconf
import conf.mysql_db as mysqldb

home_url = 'https://www.192td.com'

def main():
    # Step 1: insert the collection index from the home page into the database
    myconf.get_home_atlas(home_url)

    # Step 2: for each collection, fetch every album's info: album name, album URL, page count
    db = mysqldb.Database()
    sql = 'select * from ttb_atlas'
    results = db.fetch_all(sql)
    for row in results:
        myconf.get_atlas_album(row['AtlasName'], row['AtlasUrl'])
        print(row)
    db.close()

    # Step 3: for each album, fetch every photo's info: photo name, photo URL
    db = mysqldb.Database()
    sql = 'select * from ttb_album'
    results = db.fetch_all(sql)
    for row in results:
        myconf.get_album_photo(row['AlbumName'], row['AlbumUrl'])
    db.close()

if __name__ == '__main__':
    main()

 

3. Create ttb_manage.py (ttb_lk.py imports it as conf.ttb_manage, so place it together with mysql_db.py in a conf directory under /Python):

#!/usr/local/Cellar/python/3.7.1/bin
# -*- coding: UTF-8 -*-
import sys, requests, re, time, threading
from bs4 import BeautifulSoup
sys.path.append("/Python")
import conf.mysql_db as mysqldb

# ======================================================
# Fetch the collection index from the home page
def get_home_atlas(home_url):
    html = get_html(home_url)
    soup = BeautifulSoup(html, "lxml")
    db = mysqldb.Database()
    try:
        for ul in soup.find_all(class_='childnav'):
            for li in ul:
                sql = "insert into ttb_atlas(AtlasName,AtlasUrl) values('%s','%s')" % (li.string, li.a['href'])
                db.execute(sql)
    except Exception as e:
        print(e)
    db.close()
    return True

# Fetch the albums under a collection, part 1:
# get the page count and the URL of every page
def get_atlas_album(AtlasName, AtlasUrl):
    html = get_html(AtlasUrl)
    soup = BeautifulSoup(html, "lxml")
    # Page count of the collection ('尾页' is the "last page" link text)
    link = soup.find('a', string='尾页').get('href')
    pages = int(re.findall(r'_(\d+).html', link, re.S)[0])
    # Walk every page of the collection
    for page in range(pages, 0, -1):
        if page == 1:
            Page_Url = AtlasUrl
        else:
            Page_Url = AtlasUrl + 'index_' + str(page) + '.html'
        print(AtlasName + ' URL:' + Page_Url)
        get_atlas_album_html(AtlasName, AtlasUrl, Page_Url)
        time.sleep(1)
    return True

# Fetch the albums under a collection, part 2:
# for one page, get each album's name, URL and cover image
def get_atlas_album_html(AtlasName, AtlasUrl, Page_Url):
    html = get_html(Page_Url)
    soup = BeautifulSoup(html, "lxml")
    db = mysqldb.Database()
    for ul in soup.find(class_='clearfix'):
        try:
            AlbumUrl = AtlasUrl + re.findall(r'\/(\w+.html)', ul.a['href'], re.S)[0]
            AlbumPicDate = ul.b.string
            AlbumName = ul.span.string
            AlbumPicUrl = ul.img['lazysrc']
            sql = "insert into ttb_album(AtlasName,AlbumName,AlbumUrl,AlbumPicUrl,AlbumPicDate) values('%s','%s','%s','%s','%s')" % (AtlasName, AlbumName, AlbumUrl, AlbumPicUrl, AlbumPicDate)
            db.execute(sql)
            print(AlbumName + ' URL:' + AlbumUrl + ' inserted')
        except Exception as e:
            print(e)
    db.close()
    return True

# Fetch the photos inside one album
def get_album_photo(AlbumName, AlbumUrl):
    html = get_html(AlbumUrl)
    soup = BeautifulSoup(html, "lxml")
    # First page: photo info plus the total photo count; insert the first photo
    PhotoName = soup.img['alt']
    PhotoUrl = soup.img['lazysrc']
    PhtoNum = soup.find('span', id='allnum').get_text()
    db = mysqldb.Database()
    sql = "insert into ttb_photo(AlbumName,PhotoName,PhotoUrl) values('%s','%s','%s')" % (AlbumName, PhotoName, PhotoUrl)
    db.execute(sql)
    print("Photo 1: " + PhotoName + ' URL:' + PhotoUrl)
    db.close()
    # Remaining pages: fetch and insert each photo in its own thread
    for i in range(2, int(PhtoNum) + 1):
        url = AlbumUrl[:-5] + "_" + format(i) + ".html"
        th = threading.Thread(target=get_img_insert, args=(i, AlbumName, url))
        th.start()
        time.sleep(0.5)
    return True

# Insert a single photo's info
def get_img_insert(i, AlbumName, url):
    html = get_html(url)
    soup = BeautifulSoup(html, "lxml")
    PhotoName = soup.img['alt']
    PhotoUrl = soup.img['lazysrc']
    db = mysqldb.Database()
    print("Photo " + format(i) + ": " + PhotoName + ' URL:' + PhotoUrl)
    sql = "insert into ttb_photo(AlbumName,PhotoName,PhotoUrl) values('%s','%s','%s')" % (AlbumName, PhotoName, PhotoUrl)
    db.execute(sql)
    db.close()
    return

# ======================================================
# Download a page and return its HTML source
def get_html(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    html = resp.text
    return html

 

4. Create mysql_db.py (remember to change the MySQL connection parameters to match your environment):

#!/usr/local/Cellar/python/3.7.1/bin
# -*- coding: UTF-8 -*-
import mysql.connector
import logging

# Logging: get a logger instance and attach a formatted console handler
logger = logging.getLogger("dbSql")
formatter = logging.Formatter('%(asctime)s%(levelname)-8s:%(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)

# Database helper class
class Database:
    # Constructor
    def __init__(self):
        self._dbhost = 'localhost'   # database host
        self._dbuser = 'root'        # database user
        self._dbpassword = 'root'    # database password
        self._dbname = 'lk'          # database name
        self._dbcharset = 'utf8'     # database charset
        self._conn = self.connectMysql()
        if self._conn:
            self._cursor = self._conn.cursor()

    # Open the database connection
    def connectMysql(self):
        conn = False
        try:
            conn = mysql.connector.connect(
                host=self._dbhost,
                user=self._dbuser,
                passwd=self._dbpassword,
                database=self._dbname,
                charset=self._dbcharset,
            )
        except Exception:
            logger.error("connect database failed!")
            conn = False
        return conn

    # Execute a SQL statement directly
    def execute(self, sql):
        flag = False
        if self._conn:
            try:
                self._cursor.execute(sql)
                self._conn.commit()
                flag = True
            except Exception:
                flag = False
                logger.warning("update database exception SQL=" + sql)
        return flag

    # Fetch all rows, with column names (dict rows)
    def fetch_all(self, sql):
        result = ''
        if self._conn:
            try:
                self._cursor = self._conn.cursor(dictionary=True)
                self._cursor.execute(sql)
                result = self._cursor.fetchall()
            except Exception:
                result = False
                logger.warning("query database exception SQL=" + sql)
        return result

    # Fetch all rows, without column names (tuple rows)
    def fetchall(self, sql):
        result = ''
        if self._conn:
            try:
                self._cursor.execute(sql)
                result = self._cursor.fetchall()
            except Exception:
                result = False
                logger.warning("query database exception SQL=" + sql)
        return result

    # Fetch one row, with column names (dict row)
    def fetch_one(self, sql):
        result = ''
        if self._conn:
            try:
                self._cursor = self._conn.cursor(dictionary=True)
                self._cursor.execute(sql)
                result = self._cursor.fetchone()
            except Exception:
                result = False
                logger.warning("query database exception SQL=" + sql)
        return result

    # Fetch one row, without column names (tuple row)
    def fetchone(self, sql):
        result = ''
        if self._conn:
            try:
                self._cursor.execute(sql)
                result = self._cursor.fetchone()
            except Exception:
                result = False
                logger.warning("query database exception SQL=" + sql)
        return result

    # Close the cursor and connection if the connection was opened
    def close(self):
        if self._conn:
            try:
                if self._cursor:
                    self._cursor.close()
                self._conn.close()
            except Exception:
                logger.warning("close database exception,%s,%s" % (type(self._cursor), type(self._conn)))
        return True
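
One caveat: the INSERT statements above are built by Python string interpolation, so an album or photo title containing a quote character will break the SQL. A minimal sketch of an alternative, a hypothetical execute_params method (not in the original post) that lets mysql-connector fill in the placeholders itself:

    # Hypothetical addition to the Database class above (not part of the original post).
    # The driver substitutes the %s placeholders, so quotes in the data cannot break the SQL.
    def execute_params(self, sql, params=()):
        flag = False
        if self._conn:
            try:
                self._cursor.execute(sql, params)   # driver-side parameter binding
                self._conn.commit()
                flag = True
            except Exception:
                logger.warning("update database exception SQL=" + sql)
        return flag

Callers in ttb_manage.py would then pass the values separately, e.g.:

    db.execute_params("insert into ttb_atlas(AtlasName,AtlasUrl) values(%s,%s)",
                      (li.string, li.a['href']))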