Scraping images from a web page with Python and saving them locally

A previous article shared PHP source code for batch-downloading remote images and saving them locally. This post does the same job in Python, with two complete scripts.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension.'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the directory if it does not exist and return it.'''
def mkdir(path):
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Generate a unique string with a fixed length of 36.'''
def unique_str():
    return str(uuid.uuid1())

'''Fetch the content at a URL into memory.
@url  the file to fetch, path + filename'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''Save data to a local file.
@path       local directory
@file_name  file name
@data       file content'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique string with fixed length 36
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
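The script above targets Python 2 (urllib2, cookielib, and print statements) and will not run under Python 3. For reference, here is a minimal sketch of the same fetch-and-save flow on the Python 3 standard library; the "./imgs" save directory is a placeholder of mine, not part of the original article.

# -*- coding: utf-8 -*-
import os
import http.cookiejar
import urllib.request

def get_file(url):
    '''Fetch the content at url into memory; return None on failure.'''
    try:
        # cookie-aware opener, mirroring the LWPCookieJar setup above
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    '''Create path if needed and write data to path/file_name.'''
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

# placeholder directory; sample URL taken from the original script
save_file("./imgs", "123.jpg",
          get_file("http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"))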
Scraping the images at a specified URL with Python and saving them locally
# -*- coding: utf-8 -*-
__author__ = 'jiangyt'
'''fetch images from specific url, v1.0'''
import urllib, httplib, urlparse
import re
import random

'''Judge whether a URL exists or not.'''
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:  # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:  # everything else -> not found
            print "status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

'''Get the HTML source, return lines[].'''
def gGetHtmlLines(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> " + str(e)
        return

'''Get the HTML source, return a string.'''
def gGetHtml(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> " + str(e)
        return

'''Get the file name from a URL.'''
def gGetFileName(url):
    if url == None:
        return None
    if url == '':
        return ''
    arr = url.split("/")
    return arr[len(arr) - 1]

'''Generate a random file name.'''
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

'''Get the absolute address of a link found on the page at url.'''
def gGetAbslLink(url, link):
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

'''Match a regular expression against the given lines, return a list.'''
def gGetRegList(linesList, regx):
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

'''Download the file at url; the file name is given as a parameter.'''
def gDownloadWithFilename(url, savePath, file):
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "download %s error!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> " + str(e)

'''Download the file at url; the file name is taken from the URL.'''
def gDownload(url, savePath):
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

'''Download the jpgs on the page at downloadUrl.'''
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r'src\s*="?(\S+)\.jpg'
    lists = gGetRegList(lines, regx)  # get the links which match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

'''Get the site root address from a URL.'''
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

'''Get the parent directory from a URL.'''
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

'''Resolve a relative link (containing "..") against a URL to an absolute address.'''
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of parent levels
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

'''Collect the htm/html links on the page at url, return a list.'''
def gGetHtmlLink(url):
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r'href="?(\S+)\.htm'
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

'''Download the jpgs on the page at url and on the pages it links to.'''
def gDownloadAllJpg(url, savePath):
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

'''Test.'''
def main():
    u = 'http://site.douban.com/196738/room/2462453/'  # page to scrape images from
    save = '/root/python/tmp/'  # directory to save the images to
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
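This second script is also Python 2 only. Its core loop (fetch the page source, regex out the .jpg references, resolve them to absolute URLs, download each one) condenses considerably in Python 3, where urllib.parse.urljoin replaces the hand-rolled gGetHttpAddr / gGetHttpAddrFather / gGetHttpAddrFatherAssign helpers. Below is a minimal sketch under that substitution; the page URL is the sample from the original and the "./tmp" directory is a placeholder. A production crawler would use a real HTML parser rather than a regex.

import os
import re
import urllib.request
from urllib.parse import urljoin

def download_html_jpg(page_url, save_path):
    # fetch the page source as text, ignoring undecodable bytes
    html = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
    os.makedirs(save_path, exist_ok=True)
    # crude equivalent of the original src\s*="?(\S+)\.jpg pattern
    for src in set(re.findall(r'src\s*=\s*"?(\S+?\.jpg)', html, re.IGNORECASE)):
        img_url = urljoin(page_url, src)  # resolves /, .., and relative paths
        file_name = img_url.rsplit("/", 1)[-1]
        try:
            data = urllib.request.urlopen(img_url).read()
            with open(os.path.join(save_path, file_name), "wb") as f:
                f.write(data)
            print(file_name)
        except OSError as e:  # URLError subclasses OSError in Python 3
            print("download %s error: %s" % (img_url, e))

download_html_jpg("http://site.douban.com/196738/room/2462453/", "./tmp")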
That is all of the code for scraping images from a web page with Python and saving them locally. I hope you find it useful.