Detalje mozete pogledati na adresi: https://en.wikipedia.org/wiki/Web_scraping
Elementi sa web strane se mogu lako identifikovati uz pomoc addona za Firefox SelectorGadget (SG) koji mozete skinuti sa adrese: http://www.selectorgadget.com/
Npr. ako zelimo da skidamo prognozu vremena za Beograd sa adrese http://www.weather2umbrella.com/sr/w2u-meteo-team-tekstualna-prognoza/eta/9/318 uz pomoc SG odredimo CSS selektor koji za prognozu glasi:
Code: Select all
#main div:nth-child(6)
Code: Select all
gem install mechanize
gem install optparse
Code: Select all
#!/usr/bin/env ruby
# Command-line web scraper.
#
# Fetches a page with Mechanize, runs a search expression against it and prints
# the matching nodes in every output format that was requested on the command
# line. If no format switch is given, nothing is printed.
#
# Usage: <script> [options] site xpath
#   site  - URL of the page to fetch
#   xpath - expression handed unchanged to the page's `search` method
#
# Exit status: 0 on success, 1 on a usage error or when the fetch/search fails.
require 'mechanize'
require 'optparse'

# Every output format is off until its switch is seen.
options = { string: false, text: false, html: false, xml: false }

optparse = OptionParser.new do |opts|
  # Show the name the script was actually started as, so the usage line stays
  # correct whatever the file is called.
  opts.banner = "Usage: #{File.basename($PROGRAM_NAME)} [options] site xpath"

  opts.on('-s', '--string', 'Convert output to string') { options[:string] = true }
  opts.on('-t', '--text', 'Convert output to text') { options[:text] = true }
  opts.on('-m', '--html', 'Convert output to html') { options[:html] = true }
  opts.on('-x', '--xml', 'Convert output to xml') { options[:xml] = true }

  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

# An unknown or malformed switch is a usage error, not a crash with a backtrace.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  warn e.message
  warn optparse.help
  exit 1
end

# parse! has removed the switches from ARGV; what is left are the positionals.
site, xpath = ARGV
unless site && xpath
  warn optparse.help
  exit 1
end

# Report the real reason a fetch or search failed instead of hiding it behind
# the usage text.
begin
  scrapdata = Mechanize.new.get(site).search(xpath)
rescue StandardError => e
  warn "#{File.basename($PROGRAM_NAME)}: #{e.class}: #{e.message}"
  exit 1
end

# Several switches may be combined; each selected format is printed in turn.
puts scrapdata.to_s if options[:string]
puts scrapdata.inner_text if options[:text]
puts scrapdata.to_xml if options[:xml]
puts scrapdata.inner_html if options[:html]
Program snimimo pod nazivom webscrap.rb i u konzoli kucamo:
Code: Select all
chmod +x webscrap.rb
Code: Select all
./webscrap.rb -t http://www.weather2umbrella.com/sr/w2u-meteo-team-tekstualna-prognoza/eta/9/318 "#main div:nth-child(6)"
Jednostavno, zar ne?

Pozdrav