Specify the Shift_JIS code page explicitly: CP932.
#!/usr/bin/env ruby
# frozen_string_literal: true

# Downloads a fixed HTML page, parses it with Nokogiri using the CP932
# encoding, and prints one line per <a> element: "<link text>\t<href>".

require 'nokogiri'
require 'open-uri'
require 'csv'

url = 'https://example.com/foo.html'

# Use URI.open rather than Kernel#open: since Ruby 3.0 open-uri no longer
# teaches Kernel#open to fetch URLs, so a bare `open(url)` would look for a
# local file with that name.
html = URI.open(url, &:read)

# The third argument tells Nokogiri which encoding the raw bytes are in.
doc = Nokogiri::HTML.parse(html, nil, 'CP932')

doc.xpath('//a').each do |node|
  # Keep the link target in its own local so the page URL above is not
  # overwritten on every iteration.
  href = node['href']
  title = node.content
  puts "#{title}\t#{href}"
end
Read the URL from a command-line argument instead of hard-coding it.
#!/usr/bin/env ruby
# frozen_string_literal: true

# Downloads the HTML page whose URL is given as the first command-line
# argument, parses it with Nokogiri using the CP932 encoding, and prints one
# line per <a> element: "<link text>\t<href>".
#
# Exits with a usage message on stderr when the argument is missing or is not
# a URL that open-uri can fetch.

require 'nokogiri'
require 'open-uri'
require 'csv'

usage = "usage: #{File.basename($PROGRAM_NAME)} URL"

url = ARGV[0]
abort usage if url.nil? || url.empty?

# Parse the argument and call #open on the URI object itself instead of
# passing the raw string to Kernel#open / URI.open. Kernel#open treats a
# leading "|" as a command to run, and since Ruby 3.0 it no longer fetches
# URLs at all. Only URI classes that open-uri extends respond to #open.
uri = URI.parse(url)
abort usage unless uri.respond_to?(:open)

html = uri.open(&:read)

# The third argument tells Nokogiri which encoding the raw bytes are in.
doc = Nokogiri::HTML.parse(html, nil, 'CP932')

doc.xpath('//a').each do |node|
  # Keep the link target in its own local so the page URL above is not
  # overwritten on every iteration.
  href = node['href']
  title = node.content
  puts "#{title}\t#{href}"
end
Run it as follows:
bundle exec ruby scrape.rb https://example.com/foo.html