Python 2.7 – How to use a for loop in Scrapy?
I am using a Scrapy project to extract information from XML documents.
The XML documents have the following format, which I need to loop over:
<relatedpersonslist>
  <relatedpersoninfo>...</relatedpersoninfo>
  <relatedpersoninfo>
    <relatedpersonname>
      <firstname>mark</firstname>
      <middlename>e.</middlename>
      <lastname>lucas</lastname>
    </relatedpersonname>
    <relatedpersonaddress>
      <street1>1 imation way</street1>
      <city>oakdale</city>
      <stateorcountry>mn</stateorcountry>
      <stateorcountrydescription>minnesota</stateorcountrydescription>
      <zipcode>55128</zipcode>
    </relatedpersonaddress>
    <relatedpersonrelationshiplist>
      <relationship>executive officer</relationship>
      <relationship>director</relationship>
    </relatedpersonrelationshiplist>
    <relationshipclarification/>
  </relatedpersoninfo>
  <relatedpersoninfo>...</relatedpersoninfo>
  <relatedpersoninfo>...</relatedpersoninfo>
  <relatedpersoninfo>...</relatedpersoninfo>
  <relatedpersoninfo>...</relatedpersoninfo>
  <relatedpersoninfo>...</relatedpersoninfo>
</relatedpersonslist>
As you can see, a <relatedpersonslist> can contain multiple <relatedpersoninfo> elements, but when I try to loop over them I still only get the information of the first person.
This is my actual code:
# The asker's loop as written (reconstructed from the garbled paste: the
# scraper dropped the `for` keyword and flattened the indentation).
# NOTE(review): this still reuses/overwrites one `item` per iteration and
# never yields it inside the loop -- that is the bug being asked about.
for person in xxs.select('./relatedpersonslist/relatedpersoninfo'):
    item = myform()  # even if I get rid of this, same result
    item["firstname"] = person.select('./relatedpersonname/firstname/text()').extract()[0]
    item["middlename"] = person.select('./relatedpersonname/middlename/text()')
    # <middlename> is optional, so fall back to "na" when it is absent.
    if item["middlename"]:
        item["middlename"] = item["middlename"].extract()[0]
    else:
        item["middlename"] = "na"
Here is the code used in the spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
import urlparse

from formds.items import SecFormD


class SecDForm(CrawlSpider):
    """Crawl SEC Form D index pages and extract issuer/related-person data.

    Reconstructed from a garbled paste (the scraper stripped ``for``/``from``
    keywords and lowercased identifiers); capitalization of the Scrapy names
    follows the Scrapy 0.x API.
    """

    name = "dform"
    # NOTE(review): the domain was garbled in the original ("http://www..gov");
    # presumably "sec.gov" -- confirm. allowed_domains should also be a list of
    # bare domain names, not URLs.
    allowed_domain = ["http://www..gov"]
    start_urls = [
        ""
    ]
    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=["/html/body/div/table/tr/td[3]/a[2]"]),
            callback='parse_formd',
            # follow=True  # not needed when a callback handles the page
        ),
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[next]")]')),
            follow=True,
        ),
    )

    def parse_formd(self, response):
        """Find the XML document link(s) on a filing page and request each one."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="formdiv"]/div/table/tr[3]/td[3]/a/@href').extract()
        for site in sites:
            yield Request(url=urlparse.urljoin(response.url, site),
                          callback=self.parse_xml_document)

    def parse_xml_document(self, response):
        """Parse one Form D XML document, yielding one item per related person.

        Bug fix: the original created a single item, overwrote its person
        fields on every loop iteration, and returned it once after the loop,
        so only the last person's data survived. Instantiating the item
        inside the loop and yielding it produces one record per person.
        """
        xxs = XmlXPathSelector(response)
        # Issuer-level fields are identical for every person in the document,
        # so extract them once before the loop.
        state = xxs.select('./primaryissuer/issueraddress/stateorcountrydescription/text()').extract()[0]
        zipcode = xxs.select('./primaryissuer/issueraddress/zipcode/text()').extract()[0]
        phone = xxs.select('./primaryissuer/issuerphonenumber/text()').extract()[0]
        for person in xxs.select('./relatedpersonslist//relatedpersoninfo'):
            item = SecFormD()  # one item per person, not one shared item
            item["stateorcountrydescription"] = state
            item["zipcode"] = zipcode
            item["issuerphonenumber"] = phone
            item["firstname"] = person.select('./relatedpersonname/firstname/text()').extract()[0]
            middle = person.select('./relatedpersonname/middlename/text()')
            # <middlename> is optional; fall back to "na" when absent.
            item["middlename"] = middle.extract()[0] if middle else "na"
            yield item
I extract the information to a .json file using the command: scrapy crawl dform -o tes4.json -t json
Try this:
def parse_xml_document(self, response):
    """Parse a Form D XML document and return one item per related person.

    Reconstructed from the garbled paste (the scraper dropped the ``for``
    keyword and flattened indentation). Issuer-level fields are extracted
    once and copied onto every per-person item.
    """
    xxs = XmlXPathSelector(response)
    items = []
    # Common field values: identical for every related person in the
    # same document, so select them once before the loop.
    stateorcountrydescription = xxs.select('./primaryissuer/issueraddress/stateorcountrydescription/text()').extract()[0]
    zipcode = xxs.select('./primaryissuer/issueraddress/zipcode/text()').extract()[0]
    issuerphonenumber = xxs.select('./primaryissuer/issuerphonenumber/text()').extract()[0]
    for person in xxs.select('./relatedpersonslist//relatedpersoninfo'):
        # Instantiate one item per loop iteration -- this is the fix for
        # "only the first/last person's data shows up".
        item = SecFormD()
        # Save the common parameters.
        item["stateorcountrydescription"] = stateorcountrydescription
        item["zipcode"] = zipcode
        item["issuerphonenumber"] = issuerphonenumber
        item["firstname"] = person.select('./relatedpersonname/firstname/text()').extract()[0]
        item["middlename"] = person.select('./relatedpersonname/middlename/text()')
        # <middlename> is optional; fall back to "na" when absent.
        if item["middlename"]:
            item["middlename"] = item["middlename"].extract()[0]
        else:
            item["middlename"] = "na"
        items.append(item)
    return items
Comments
Post a Comment