Scrapy to download specific type files
I'm new to Scrapy and Python. I can download all of the files, but I want to download only files of one specific type, "EX-10", so that it downloads the following files (EX-10.1, EX-10.2 ... EX-10.99).
My code:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
I also want Scrapy to follow the next pages (up to the last page), but that part is not working. I tried the following (a possible reason and a fix are sketched after these snippets):
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@value="Next 40"]',)), callback="parse", follow=True),)

# follow next page links
next_page = response.xpath('.//a[@value="Next 40"]/@href').extract()
if next_page:
    next_href = next_page[0]
    next_page_url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany' + next_href
    request = scrapy.Request(url=next_page_url)
    yield request
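For what it's worth, a likely reason the snippet above matches nothing is that Rules/LinkExtractor only take effect on a CrawlSpider (not a plain scrapy.Spider), and on the EDGAR listing the "Next 40" control is an input button whose target URL sits in an onclick attribute, so //a[@value="Next 40"] finds no element. A minimal sketch of next-page following inside parse, mirroring the approach in the accepted answer below (it assumes the onclick value has the form parent.location='/cgi-bin/...'):

    # Sketch: drop-in replacement for parse() in the spider above (untested).
    # Assumes the "Next 40" button's onclick looks like parent.location='/cgi-bin/...'
    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_links)

        onclick = response.css("input[value='Next 40']::attr(onclick)").extract_first()
        if onclick:
            next_href = onclick.split("parent.location=")[1].replace("'", "")
            # Explicit callback so the next listing page is parsed the same way
            yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)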
Tags: python, web-scraping, scrapy, web-crawler, scrapy-spider
Do you have an example link where any of these Ex-10.1, Ex-10.2 to EX-10.99 are found? And do they still need the .html or .txt file mask?
– QHarr
Nov 16 '18 at 11:57
Thanks for your response, QHarr. Check this link; there are around 8 files in it, but I want only EX-10.46, EX-10.47 and EX-10.48. In the same way I want to check all filings and download only files of this specific type, EX-10 (so that it downloads EX-10.1, EX-10.2 ... EX-10.99). It would also be better if I could get all the files in PDF format.
– Vinod kumar
Nov 16 '18 at 12:17
I get a 404 (page not found). And by .txt and .htm I mean: do you still want to filter on those in the format column?
– QHarr
Nov 16 '18 at 12:18
sec.gov/Archives/edgar/data/914201/000089924302000881/…
– Vinod kumar
Nov 16 '18 at 12:21
Yes for filter.
– Vinod kumar
Nov 16 '18 at 12:22
asked Nov 16 '18 at 11:32 by Vinod kumar, edited Nov 16 '18 at 11:39
2 Answers
Your problem can be solved as follows. The script below should fetch the required files from that site, following every pagination link and downloading the files the way you wanted:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

        nextpage = response.css("input[value='Next 40']::attr(onclick)")
        if nextpage:
            tpage = nextpage.extract_first().split("parent.location=")[1].replace("'", "")
            nlink = response.urljoin(tpage)
            yield scrapy.Request(url=nlink, callback=self.parse)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract():
            baseLink = response.urljoin(links)
            yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
Thanks for the answer, SIM. I can see that all EX files are being downloaded, but I need only EX-10 (so that it downloads EX-10.1, EX-10.2 ... EX-10.99). So I have to change it like this: for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-10")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract(): Am I right?
– Vinod kumar
Nov 17 '18 at 14:49
Yeah, it seems so.
– SIM
Nov 17 '18 at 15:28
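For completeness, a sketch of collecting_file_links with the narrower filter from the comment above; it only swaps "EX-" for "EX-10" in the starts-with() predicate, so rows whose Type cell begins with EX-10 (EX-10.1 ... EX-10.99) are kept:

    # Sketch: drop-in replacement for collecting_file_links() in the answer above (untested).
    def collecting_file_links(self, response):
        for links in response.xpath(
            '//table[contains(@summary,"Document")]'
            '//tr[td[starts-with(., "EX-10")]]'
            '/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href'
        ).extract():
            yield scrapy.Request(url=response.urljoin(links), callback=self.download_files)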
You need to use a FilesPipeline, but the one that Scrapy provides generates the file name based on a hash of the URL. If you want a custom file name, you have to make your own FilesPipeline, like this:
import scrapy, os
from scrapy.pipelines.files import FilesPipeline

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    custom_settings = {
        'ITEM_PIPELINES': {'myspider.MyFilesPipeline': 1},
        'FILES_STORE': '/my/valid/path/',
    }

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                yield {
                    'file_urls': [response.urljoin(links)]
                }

class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]
Thanks for your answer, Guillaume. I have a doubt: where have you specified the file types? I have to download only files of a specific type, for example EX-10.
– Vinod kumar
Nov 17 '18 at 5:58
You can filter the file names that you pass in file_urls, for example: if 'Ex-10' in links: yield ...
– Guillaume
Nov 17 '18 at 6:02
Are you saying that I should add EX-10 on this line, for example: yield { 'file_urls': [response.urljoin(EX-10)] }? Am I right, or do I need to change something else?
– Vinod kumar
Nov 17 '18 at 6:17
Do you have an example URL of such file?
– Guillaume
Nov 17 '18 at 6:21
Yes, check this link; there are a lot of files in it, but I have to download only EX-10.46, EX-10.47 and EX-10.48. Each filing link is like this...
– Vinod kumar
Nov 17 '18 at 7:08
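Putting the suggestions from this comment thread together, a sketch of collecting_file_links that keeps only EX-10 rows and hands the URLs to the FilesPipeline from this answer; the row-type XPath is borrowed from the other answer and is an assumption about the EDGAR document table:

    # Sketch: drop-in replacement for collecting_file_links() in the answer above (untested).
    # Only rows whose Type cell starts with "EX-10" are yielded as file items,
    # which MyFilesPipeline then downloads under the original file name.
    def collecting_file_links(self, response):
        for links in response.xpath(
            '//table[contains(@summary,"Document")]'
            '//tr[td[starts-with(., "EX-10")]]'
            '/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href'
        ).extract():
            yield {'file_urls': [response.urljoin(links)]}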