Python – Downloading images from Google Image Search?
I want to download all of the images from a Google Image Search using Python. The code I am using seems to have a problem. My code is:
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson

# Define search term
searchTerm = "parrot"

# Replace spaces ' ' in the search term with '%20' to form a valid request
searchTerm = searchTerm.replace(' ', '%20')

# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

myopener = MyOpener()

# Set count to 0
count = 0

for i in range(0, 10):
    # Notice that `start` changes on each iteration in order to request a new set of images
    url = ('https://ajax.googleapis.com/ajax/services/search/images?' +
           'v=1.0&q=' + searchTerm + '&start=' + str(i * 10) + '&userip=MyIP')
    print url
    request = urllib2.Request(url, None, {'Referer': 'testing'})
    response = urllib2.urlopen(request)

    # Get results using JSON
    results = simplejson.load(response)
    data = results['responseData']
    dataInfo = data['results']

    # Iterate over each result and get the unescaped url
    for myUrl in dataInfo:
        count = count + 1
        my_url = myUrl['unescapedUrl']
        myopener.retrieve(myUrl['unescapedUrl'], str(count) + '.jpg')
After downloading a few pages, I get an error like the one below:
Traceback (most recent call last):
  File "C:\Python27\img_google3.py", line 37, in <module>
    dataInfo = data['results']
TypeError: 'NoneType' object has no attribute '__getitem__'
What should I do?
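For context on the error: the TypeError means results['responseData'] came back as None. The old AJAX API returns "responseData": null together with a responseStatus / responseDetails pair when it throttles or rejects a request, which typically happens after a few rapid-fire pages. Below is a minimal defensive sketch; the fetch_results helper and its back-off delay are additions for illustration, not part of the original code:

import time
import urllib2
import simplejson

def fetch_results(url, retries=3):
    # Retry when the deprecated AJAX API returns "responseData": null.
    for attempt in range(retries):
        request = urllib2.Request(url, None, {'Referer': 'testing'})
        results = simplejson.load(urllib2.urlopen(request))
        if results.get('responseData') is not None:
            return results['responseData']['results']
        # responseStatus / responseDetails describe the failure (often rate limiting)
        print 'API error:', results.get('responseStatus'), results.get('responseDetails')
        time.sleep(2 ** attempt)  # back off before retrying
    return []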
I modified my code. The code can now download 100 images for a given query, and the images are full high-resolution originals.
I am using urllib2 and Beautiful Soup to download the images:
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
import cookielib
import json

def get_soup(url, header):
    return BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)), 'html.parser')

query = raw_input("query image")  # you can change the query for the image here
image_type = "ActiOn"
query = query.split()
query = '+'.join(query)
url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
print url

# add the directory for your images here
DIR = "Pictures"
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
soup = get_soup(url, header)

ActualImages = []  # contains the links to the large original images plus the image type
for a in soup.find_all("div", {"class": "rg_meta"}):
    link, Type = json.loads(a.text)["ou"], json.loads(a.text)["ity"]
    ActualImages.append((link, Type))

print "there are total", len(ActualImages), "images"

if not os.path.exists(DIR):
    os.mkdir(DIR)
DIR = os.path.join(DIR, query.split()[0])

if not os.path.exists(DIR):
    os.mkdir(DIR)

# download the images
for i, (img, Type) in enumerate(ActualImages):
    try:
        # pass the header dict itself; wrapping it as {'User-Agent': header} would nest the dict
        req = urllib2.Request(img, headers=header)
        raw_img = urllib2.urlopen(req).read()
        cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
        print cntr
        if len(Type) == 0:
            f = open(os.path.join(DIR, image_type + "_" + str(cntr) + ".jpg"), 'wb')
        else:
            f = open(os.path.join(DIR, image_type + "_" + str(cntr) + "." + Type), 'wb')
        f.write(raw_img)
        f.close()
    except Exception as e:
        print "could not load : " + img
        print e
I hope this helps you.
The Google Image Search API has been deprecated; you need to use the Google Custom Search API to get what you want. To fetch the images you need to do this:
import urllib2
import simplejson
import cStringIO

fetcher = urllib2.build_opener()
searchTerm = 'parrot'
startIndex = 0
# startIndex must be converted to a string before concatenation
searchUrl = "http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + searchTerm + "&start=" + str(startIndex)
f = fetcher.open(searchUrl)
deserialized_output = simplejson.load(f)
This gives you 4 results as JSON; to get more results, fetch them iteratively by incrementing startIndex in each API request.
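A minimal paging sketch, assuming the same deprecated endpoint and the 4-results-per-page behaviour described above (the range bound of 20 is an arbitrary illustration):

import urllib2
import simplejson

searchTerm = 'parrot'
all_results = []
for start in range(0, 20, 4):  # the deprecated API served 4 results per page
    searchUrl = ("http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q="
                 + searchTerm + "&start=" + str(start))
    data = simplejson.load(urllib2.build_opener().open(searchUrl)).get('responseData')
    if data is None:  # the deprecated API frequently rejects requests
        break
    all_results.extend(data['results'])
print len(all_results), "result links collected"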
To get the image data itself, you need to use a library like cStringIO.
For example, to access the first image, you would do this:
import urllib
import cStringIO
from PIL import Image  # needed for Image.open below

imageUrl = deserialized_output['responseData']['results'][0]['unescapedUrl']
file = cStringIO.StringIO(urllib.urlopen(imageUrl).read())
img = Image.open(file)
Google has deprecated their API, and scraping Google is complicated, so I would suggest using the Bing API instead:
https://datamarket.azure.com/dataset/5BA839F1-12CE-4CCE-BF57-A49D98D29A44
Google isn't that good, and Microsoft isn't that evil.
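For reference, a sketch of querying the Azure Datamarket Bing Image Search endpoint linked above; the account-key placeholder, the $top value, and the field names ('d', 'results', 'MediaUrl') reflect how I recall that API responding, so treat them as assumptions:

import requests

account_key = 'YOUR_ACCOUNT_KEY'  # hypothetical placeholder for your Datamarket key
url = 'https://api.datamarket.azure.com/Bing/Search/Image'
params = {'Query': "'parrot'", '$format': 'json', '$top': 10}  # Query must be quoted
r = requests.get(url, params=params, auth=('', account_key))  # the key doubles as the password
for item in r.json()['d']['results']:
    print item['MediaUrl']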
I haven't looked at your code, but here is an example made with Selenium that tries to get 400 images for a search term:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import os
import urllib2

searchterm = 'vannmelon'  # will also be the name of the folder
url = "https://www.google.co.in/search?q=" + searchterm + "&source=lnms&tbm=isch"
browser = webdriver.Firefox()
browser.get(url)
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0

if not os.path.exists(searchterm):
    os.mkdir(searchterm)

for _ in range(500):
    browser.execute_script("window.scrollBy(0,10000)")

for x in browser.find_elements_by_xpath("//div[@class='rg_meta']"):
    counter = counter + 1
    print "Total Count:", counter
    print "Successful Count:", succounter
    print "URL:", json.loads(x.get_attribute('innerHTML'))["ou"]

    img = json.loads(x.get_attribute('innerHTML'))["ou"]
    imgtype = json.loads(x.get_attribute('innerHTML'))["ity"]
    try:
        # pass the header dict itself, not {'User-Agent': header}
        req = urllib2.Request(img, headers=header)
        raw_img = urllib2.urlopen(req).read()
        File = open(os.path.join(searchterm, searchterm + "_" + str(counter) + "." + imgtype), "wb")
        File.write(raw_img)
        File.close()
        succounter = succounter + 1
    except:
        print "can't get img"

print succounter, "pictures successfully downloaded"
browser.close()
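One caveat: the 500 scrolls above fire back to back, so the page's lazy loader may not keep up and many rg_meta divs never appear. A sketch of the same loop with a pause (the 0.3-second delay and 50-scroll count are arbitrary choices):

import time

def scroll_to_bottom(browser, n_scrolls=50, pause=0.3):
    """Scroll the results page in steps, pausing so lazy loading can keep up."""
    for _ in range(n_scrolls):
        browser.execute_script("window.scrollBy(0, 10000)")
        time.sleep(pause)  # give the page time to append the next thumbnails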
You can also use Selenium with Python. Here is how:
from selenium import webdriver
import urllib
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome('C:/Python27/Scripts/chromedriver.exe')
word = "apple"
url = "http://images.google.com/search?q=" + word + "&tbm=isch&sout=1"
driver.get(url)
imageXpathSelector = '//*[@id="ires"]/table/tbody/tr[1]/td[1]/a/img'
img = driver.find_element_by_xpath(imageXpathSelector)
src = img.get_attribute('src')
urllib.urlretrieve(src, word + ".jpg")
driver.close()
(This code runs on Python 2.7.) Note that you should install the Selenium package with "pip install selenium", and that you should download chromedriver.exe from here.
Unlike other web-scraping techniques, Selenium opens a real browser and downloads the items through it, because Selenium's intended job is testing rather than scraping.
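If the visible browser window gets in the way, Chrome can also be run headless; a sketch, assuming a Chrome/Selenium combination new enough to support the --headless flag (the chromedriver path repeats the assumption from the answer above):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a visible window
driver = webdriver.Chrome('C:/Python27/Scripts/chromedriver.exe', chrome_options=options)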
Adding to Piees's answer: to download any number of images from the search results, we also need to simulate a click on the "Show more results" button once the first 400 results have loaded.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import json
import urllib2
import sys
import time

# adding the path to geckodriver to the OS environment variable,
# assuming that it is stored at the same path as this script
os.environ["PATH"] += os.pathsep + os.getcwd()
download_path = "dataset/"

def main():
    searchtext = sys.argv[1]  # the search query
    num_requested = int(sys.argv[2])  # number of images to download
    # number_of_scrolls * 400 images will be opened in the browser
    number_of_scrolls = num_requested / 400 + 1

    if not os.path.exists(download_path + searchtext.replace(" ", "_")):
        os.makedirs(download_path + searchtext.replace(" ", "_"))

    url = "https://www.google.co.in/search?q=" + searchtext + "&source=lnms&tbm=isch"
    driver = webdriver.Firefox()
    driver.get(url)

    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    extensions = {"jpg", "jpeg", "png", "gif"}
    img_count = 0
    downloaded_img_count = 0

    for _ in xrange(number_of_scrolls):
        for __ in xrange(10):
            # multiple scrolls needed to show all 400 images
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        # to load the next 400 images
        time.sleep(0.5)
        try:
            driver.find_element_by_xpath("//input[@value='Show more results']").click()
        except Exception as e:
            print "Less images found:", e
            break

    # imges = driver.find_elements_by_xpath('//div[@class="rg_meta"]')  # not working anymore
    imges = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
    print "Total images:", len(imges), "\n"
    for img in imges:
        img_count += 1
        img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
        img_type = json.loads(img.get_attribute('innerHTML'))["ity"]
        print "Downloading image", img_count, ": ", img_url
        try:
            if img_type not in extensions:
                img_type = "jpg"
            req = urllib2.Request(img_url, headers=headers)
            raw_img = urllib2.urlopen(req).read()
            f = open(download_path + searchtext.replace(" ", "_") + "/" + str(downloaded_img_count) + "." + img_type, "wb")
            f.write(raw_img)
            f.close()  # the original had "f.close", which never actually closed the file
            downloaded_img_count += 1
        except Exception as e:
            print "Download failed:", e
        finally:
            print
        if downloaded_img_count >= num_requested:
            break

    print "Total downloaded: ", downloaded_img_count, "/", img_count
    driver.quit()

if __name__ == "__main__":
    main()
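Assuming the script above is saved as download_google_images.py (a file name chosen here purely for illustration), it takes the query and the requested image count as command-line arguments:

python download_google_images.py "water melon" 600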
The complete code is here.
I know this question is old, but I ran into it recently, and none of the previous answers work anymore. So I wrote this script to gather images from Google. As of right now, it can download as many images as are available.
Here is a GitHub link as well: https://github.com/CumminUp07/imengine/blob/master/get_google_images.py
Disclaimer: due to copyright concerns, the images gathered should only be used for research and educational purposes.
from bs4 import BeautifulSoup as Soup
import urllib2
import json
import urllib

# programmatically go through the google image ajax json return and save the links to a list
# num_images is more of a suggestion:
# it will get the ceiling of the nearest 100, if available
def get_links(query_string, num_images):
    # initialize place for links
    links = []
    # step by 100 because each return gives up to 100 links
    for i in range(0, num_images, 100):
        url = ('https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q=' + query_string +
               '&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start=' + str(i) +
               '&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s')

        # set user agent to avoid 403 error
        request = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})

        # returns a json formatted string containing the html
        json_string = urllib2.urlopen(request).read()

        # parse as json
        page = json.loads(json_string)

        # the html is found here
        html = page[1][1]

        # use BeautifulSoup to parse as html
        new_soup = Soup(html, 'lxml')

        # all img tags, only returns results of the search
        imgs = new_soup.find_all('img')

        # loop through the images and put each src in the links list
        for j in range(len(imgs)):
            links.append(imgs[j]["src"])

    return links

# download images:
# takes a list of links, a directory to save to, and a prefix for file names;
# saves images in the directory numbered one-up with the prefix added;
# all images will be .jpg
def get_images(links, directory, pre):
    for i in range(len(links)):
        urllib.urlretrieve(links[i], "./" + directory + "/" + str(pre) + str(i) + ".jpg")

# main function to search images:
# takes two lists, base terms and secondary terms,
# plus the number of images to download per combination;
# it runs every combination of search terms,
# with the base term first, then the secondary term
def search_images(base, terms, num_images):
    for y in range(len(base)):
        for x in range(len(terms)):
            all_links = get_links(base[y] + '+' + terms[x], num_images)
            get_images(all_links, "images", x)

if __name__ == '__main__':
    terms = ["cars", "numbers", "scenery", "people", "dogs", "cats", "animals"]
    base = ["animated"]
    search_images(base, terms, 1000)
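One caveat: get_images writes into ./images/, but nothing in the script creates that directory, and urllib.urlretrieve raises an IOError when it cannot open the destination file. Creating the directory first avoids this:

import os

if not os.path.exists("images"):
    os.mkdir("images")  # urlretrieve will not create missing directories itself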