Python dictionary: looking up a specific key
I have code that scrapes a specific web page, and I want to build on it so that I can look up specific details. For example, if I enter a style ID, it should give me the details related to it, or if I enter a category, it should give me the details of the items in that category. The code is:
# Scrape a single Barneys product page and print what can be found on it:
# category, brand, breadcrumb text, product type, details, style id,
# recommended product ids, image URL, prices and the product ids of the other
# colour variants. Every field is best-effort: a field that is missing from
# the page is skipped instead of aborting the run.
import re

import requests
from bs4 import BeautifulSoup

# Seconds to wait for each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# The recommendations endpoint takes the style id between these two pieces.
RECS_URL_PREFIX = (
    "http://recs.richrelevance.com/rrserver/p13n_generated.js"
    "?a=dbeab3c977a08905&ts=1434386243747&p="
)
RECS_URL_SUFFIX = (
    "&pt=%7citem_page.rr1%7citem_page.featured_item_0"
    "%7citem_page.featured_item_1%7citem_page.featured_item_2"
    "%7citem_page.featured_item_3"
    "&u=mvbbr9wkg1pj7zehlfmnxwzrp4wgmedlg4m%3d"
    "&s=mvbbr9wkg1pj7zehlfmnxwzrp4wgmedlg4m%3d"
    "&cts=http%3a%2f%2fwww.barneys.com"
    "&chi=%7cmens-shirts-dress-classic"
    "&flv=18.0.0"
    "&rcs=ef4nyjeogcambdcfybs0obqfyg28bhrihnzu88v68sjxf881tdusq6hytimwomrgm9gkh9fpzo21oln3qbt3oguyocatzpgrp7a2emy"
    "&l=1"
)

# Errors raised when an expected tag, attribute or list entry is absent.
MISSING_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError)

url = "http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r = requests.get(url, timeout=REQUEST_TIMEOUT)
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(r.content, "html.parser")

links = soup.find_all("a")
img = soup.find(itemprop="image")

# --- Category -------------------------------------------------------------
g_d4 = soup.find_all("ol", {"class": "breadcrumb"})
pattern_2 = re.compile(r"clothing/(\w+)")
# The link search covers the whole document, so it is done once up front.
links_2 = soup.find_all(
    "a", href=re.compile("^http://www.barneys.com/barneys-new-york/men/")
)
for item in g_d4:
    # Print the first category link found, once per breadcrumb list.
    for link in links_2:
        match_1 = pattern_2.search(link["href"])
        if match_1:
            print("category:- " + match_1.group(1))
            break

# --- Product details ------------------------------------------------------
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        print("\n\nbrand:-" + item.contents[1].text)
    except MISSING_FIELD_ERRORS:
        pass  # brand not present on this page

    try:
        a_1 = item.find("ol", {"class": "breadcrumb"})
        a_2 = a_1.text
        print(a_2)
    except MISSING_FIELD_ERRORS:
        pass  # no breadcrumb inside the product block

    try:
        # Both attributes go in ONE attrs dict; a second positional dict
        # would be taken as find()'s `recursive` argument and ignored.
        heading = item.find("h1", {"class": "product-name", "itemprop": "name"})
        print("type:-" + heading.text + ";")
    except MISSING_FIELD_ERRORS:
        pass  # product name heading not found

    try:
        d2 = item.find("div", {"class": "panel-body standard-p"})
        d3 = d2.text
        # The style id is the first 9-digit number in the details text.
        p_id = re.findall(r"[0-9]{9}", d3)
        id_2 = p_id[0]
        url_1 = RECS_URL_PREFIX + str(id_2) + RECS_URL_SUFFIX
        r_1 = requests.get(url_1, timeout=REQUEST_TIMEOUT)
        # Recommended product ids appear as "p=<digits>&" in the response.
        pattern = re.compile(r"(?<=p=)[0-9]+(?=&)")
        product_ids = pattern.findall(r_1.text)
        print("details:- " + d3 + ";")
        print("\nstyle id:- " + id_2 + ";")
        print("\nrecommended product id's:- ")
        print(",".join(product_ids))
    except MISSING_FIELD_ERRORS + (requests.RequestException,):
        pass  # details panel / style id missing, or recommendations unreachable

    try:
        print("\nurl:-" + img["src"] + ";")
    except MISSING_FIELD_ERRORS:
        pass  # no product image (or it has no src attribute)

    try:
        print("\nfull price:-" + item.find("span", {"class": "price-standard"}).text + ";")
    except MISSING_FIELD_ERRORS:
        pass  # no standard price shown

    try:
        print("\ndiscounted price:-" + item.find("span", {"class": "price-sales"}).text + ";")
    except MISSING_FIELD_ERRORS:
        pass  # no sale price shown

# --- Other colours --------------------------------------------------------
g_d2 = soup.find_all("div", {"class": "color-scroll"})
pattern_1 = re.compile(r"pid=(\w+)")
links_1 = soup.find_all(
    "a",
    href=re.compile("^/on/demandware.store/sites-bny-site/default/product-variation"),
)
for item in g_d2:
    # The first variation link is skipped, as in the original script.
    for link in links_1[1:]:
        match = pattern_1.search(link["href"])
        if match:
            print("\nproduct id of other color:-")
            print(match.group(1))
I added a dictionary called `d`:
# Scrape a single Barneys product page, print what can be found on it and
# collect the same information in the dictionary `d` so it can be looked up
# by key afterwards. Every field is best-effort: a field that is missing from
# the page is skipped (and its key is simply not set) instead of aborting.
import re

import requests
from bs4 import BeautifulSoup

# Seconds to wait for each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# The recommendations endpoint takes the style id between these two pieces.
RECS_URL_PREFIX = (
    "http://recs.richrelevance.com/rrserver/p13n_generated.js"
    "?a=dbeab3c977a08905&ts=1434386243747&p="
)
RECS_URL_SUFFIX = (
    "&pt=%7citem_page.rr1%7citem_page.featured_item_0"
    "%7citem_page.featured_item_1%7citem_page.featured_item_2"
    "%7citem_page.featured_item_3"
    "&u=mvbbr9wkg1pj7zehlfmnxwzrp4wgmedlg4m%3d"
    "&s=mvbbr9wkg1pj7zehlfmnxwzrp4wgmedlg4m%3d"
    "&cts=http%3a%2f%2fwww.barneys.com"
    "&chi=%7cmens-shirts-dress-classic"
    "&flv=18.0.0"
    "&rcs=ef4nyjeogcambdcfybs0obqfyg28bhrihnzu88v68sjxf881tdusq6hytimwomrgm9gkh9fpzo21oln3qbt3oguyocatzpgrp7a2emy"
    "&l=1"
)

# Errors raised when an expected tag, attribute or list entry is absent.
MISSING_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError)

d = {}

url = "http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r = requests.get(url, timeout=REQUEST_TIMEOUT)
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(r.content, "html.parser")

# d["links"] is a list of (attribute name, values for every <a> tag) pairs.
links = soup.find_all("a")
d["links"] = [
    ("href", [link.get("href") for link in links]),
    ("class", [link.get("class") for link in links]),
]

# d["img"] is a list holding ONE list of (name, value) pairs, so the image
# attributes are read as d["img"][0], not d["img"]["alt"].
img = soup.find(itemprop="image")
d["img"] = []
if img is not None:
    # "class" is multi-valued; fall back to [None] when the tag has no class.
    img_classes = img.get("class") or [None]
    d["img"].append(
        [
            ("alt", img.get("alt")),
            ("src", img.get("src")),
            ("itemprop", img.get("itemprop")),
            ("class", img_classes[0]),
        ]
    )

# --- Category -------------------------------------------------------------
g_d4 = soup.find_all("ol", {"class": "breadcrumb"})
pattern_2 = re.compile(r"clothing/(\w+)")
# The link search covers the whole document, so it is done once up front.
links_2 = soup.find_all(
    "a", href=re.compile("^http://www.barneys.com/barneys-new-york/men/")
)
for item in g_d4:
    # Print the first category link found, once per breadcrumb list.
    for link in links_2:
        match_1 = pattern_2.search(link["href"])
        if match_1:
            print("category:- " + match_1.group(1))
            break

# --- Product details ------------------------------------------------------
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        d["brand"] = item.contents[1].text
        print("\n\nbrand:-" + d["brand"])
    except MISSING_FIELD_ERRORS:
        pass  # brand not present on this page

    try:
        a_1 = item.find("ol", {"class": "breadcrumb"})
        a_2 = a_1.text
        d["a_2"] = a_2
        print(a_2)
    except MISSING_FIELD_ERRORS:
        pass  # no breadcrumb inside the product block

    try:
        # Both attributes go in ONE attrs dict; a second positional dict
        # would be taken as find()'s `recursive` argument and ignored.
        heading = item.find("h1", {"class": "product-name", "itemprop": "name"})
        print("type:-" + heading.text + ";")
        d["type"] = heading.text
    except MISSING_FIELD_ERRORS:
        pass  # product name heading not found

    try:
        d2 = item.find("div", {"class": "panel-body standard-p"})
        d3 = d2.text
        # The style id is the first 9-digit number in the details text.
        p_id = re.findall(r"[0-9]{9}", d3)
        id_2 = p_id[0]
        url_1 = RECS_URL_PREFIX + str(id_2) + RECS_URL_SUFFIX
        r_1 = requests.get(url_1, timeout=REQUEST_TIMEOUT)
        # Recommended product ids appear as "p=<digits>&" in the response.
        pattern = re.compile(r"(?<=p=)[0-9]+(?=&)")
        product_ids = pattern.findall(r_1.text)
        print("details:- " + d3 + ";")
        d["details"] = d3.split(",")
        print("\nstyle id:- " + id_2 + ";")
        d["style"] = ("id", id_2)
        print("\nrecommended product id's:- ")
        print(",".join(product_ids))
        d["recommendedproductids"] = list(product_ids)
    except MISSING_FIELD_ERRORS + (requests.RequestException,):
        pass  # details panel / style id missing, or recommendations unreachable

    try:
        print("\nurl:-" + img["src"] + ";")
    except MISSING_FIELD_ERRORS:
        pass  # no product image (or it has no src attribute)

    try:
        print("\nfull price:-" + item.find("span", {"class": "price-standard"}).text + ";")
    except MISSING_FIELD_ERRORS:
        pass  # no standard price shown

    try:
        print("\ndiscounted price:-" + item.find("span", {"class": "price-sales"}).text + ";")
    except MISSING_FIELD_ERRORS:
        pass  # no sale price shown

# --- Other colours --------------------------------------------------------
g_d2 = soup.find_all("div", {"class": "color-scroll"})
pattern_1 = re.compile(r"pid=(\w+)")
links_1 = soup.find_all(
    "a",
    href=re.compile("^/on/demandware.store/sites-bny-site/default/product-variation"),
)
for item in g_d2:
    # The first variation link is skipped, as in the original script.
    for link in links_1[1:]:
        match = pattern_1.search(link["href"])
        if match:
            print("\nproduct id of other color:-")
            print(match.group(1))
Comments
Post a Comment