#! /usr/bin/env python3
"""Download the URL given on the command line into the current directory.

The URL is first rebuilt with each component escaped (IDNA for the host,
percent-encoding for user, password and path, form-encoding for the query),
then fetched with ``urlretrieve``.  When the response carries a
``Last-Modified`` header, the saved file's access and modification times are
set from it.

Usage: <script> URL
"""

from urllib.parse import urlsplit, parse_qsl, quote, quote_plus, urlencode
from urllib.request import urlretrieve
import os
import socket
import sys
import urllib.request


def normalize_url(url):
    """Return *url* rebuilt with every component escaped.

    * user and password are percent-encoded with ``quote``;
    * the host name is converted to its ASCII (IDNA) form, and an IPv6
      literal gets its square brackets back;
    * the path is percent-encoded with ``quote``;
    * the query is decoded with ``parse_qsl`` and re-encoded with
      ``urlencode`` (parameters with empty values are preserved);
    * the fragment is encoded with ``quote_plus``.

    Raises ``ValueError`` if the port in *url* is not a valid port number
    (raised by ``SplitResult.port``).
    """
    split_results = urlsplit(url)
    protocol, _, path, query, tag = split_results
    user = split_results.username
    password = split_results.password
    host = split_results.hostname
    port = split_results.port

    parts = [protocol, "://"]

    if user:
        parts.append(quote(user))
        if password:
            parts.append(':' + quote(password))
        parts.append('@')

    if host:
        host = host.encode('idna').decode('ascii')
        if ':' in host:
            # ``hostname`` strips the brackets from an IPv6 literal; without
            # them the colons would be confused with the port separator.
            host = '[%s]' % host
        parts.append(host)
        if port:
            parts.append(':%d' % port)

    if path:
        # NOTE(review): a path that is already percent-encoded is escaped
        # again here ('%20' becomes '%2520') -- confirm that the input is
        # expected to be a raw, unescaped URL.
        parts.append(quote(path))

    if query:
        # keep_blank_values=True: otherwise "?x=&y=1" or "?flag" would
        # silently lose parameters before the request is sent.
        qlist = parse_qsl(query, keep_blank_values=True)
        parts.append('?' + urlencode(qlist))

    if tag:
        parts.append('#' + quote_plus(tag))

    return "".join(parts)


def destination_filename(url):
    """Return the local file name under which *url* is saved.

    This is the last '/'-separated component of the whole URL string, so a
    query string and fragment, if present, are part of the name.  A URL that
    ends with '/' is saved as ``_index.html``.
    """
    return os.path.basename(url) or "_index.html"


def _install_opener():
    """Install the opener used by ``urlretrieve`` and return it.

    The opener sends ``User-agent: Python-urllib/<version>`` and an
    ``Accept-Charset`` header in addition to the headers ``URLopener`` sets
    by default.
    """
    # I remember seeing some sites that return broken HTML or even HTTP
    # response without "compatible" user agent; I don't know if such sites
    # are still around, but this header doesn't cause any harm so I'd better
    # continue to use it.
    # UPDATE: I saw a number of sites that forbid "Mozilla compatible".
    urllib_version = urllib.request.__version__
    server_version = "Python-urllib/%s" % urllib_version

    # NOTE(review): URLopener is a deprecated legacy class (see the
    # urllib.request documentation).  It is kept here because it presumably
    # takes care of "user:password@" in HTTP URLs -- verify before replacing
    # it with build_opener().  The class is defined inside this function so
    # that merely importing the module does not touch the legacy API.
    class MyURLopener(urllib.request.URLopener):
        # urlopen() passes a timeout argument that URLopener.open() does not
        # accept; take it here and drop it.
        def open(self, fullurl, data=None,
                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
            return urllib.request.URLopener.open(self, fullurl, data)

    opener = urllib.request._opener = MyURLopener()
    opener.addheaders[0] = ('User-agent', server_version)
    opener.addheaders.append(('Accept-Charset', "koi8-r;q=1.0"))
    return opener


def main(argv=None):
    """Fetch the URL named by ``argv[1]`` (default: ``sys.argv``).

    Exits with a usage message when no URL is given.  Arguments after the
    URL are ignored.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("Usage: %s URL" % os.path.basename(argv[0]))

    url = normalize_url(argv[1])
    _install_opener()

    dest_file = destination_filename(url)
    _, headers = urlretrieve(url, filename=dest_file)

    if "last-modified" in headers:
        # Imported lazily: only needed when the server reports a timestamp.
        from m_lib.net.www.util import parse_time
        last_modified = parse_time(headers["last-modified"])
        if last_modified:
            os.utime(dest_file, (last_modified, last_modified))


if __name__ == "__main__":
    main()