diff options
| author | Skip Montanaro <[email protected]> | 2021-02-16 20:17:05 -0600 |
|---|---|---|
| committer | Skip Montanaro <[email protected]> | 2021-02-16 20:17:05 -0600 |
| commit | c0f170462209484074470d693af44c65e5a252ac (patch) | |
| tree | 9ad1cd3b7768069e4518befda5d496b33d3a95f3 | |
| parent | 827843405f67b88e62380846eb96969b389117d5 (diff) | |
| download | python-0.9.1-patched-QoL-c0f170462209484074470d693af44c65e5a252ac.tar.xz python-0.9.1-patched-QoL-c0f170462209484074470d693af44c65e5a252ac.zip | |
simple script to extract (most of) the original shell archive from the Google Groups HTML
| -rw-r--r-- | shar/sharify.py | 55 |
1 files changed, 55 insertions, 0 deletions
#!/usr/bin/env python

r"""A better reconstitution of the original shell archives from the Google HTML files.

Reads the extracted ``<section>`` payload of a Google Groups page on stdin,
undoes the escaping found in it, and writes the resulting shell archive text
to stdout (with trailing whitespace stripped).

Example (for part 09):

curl https://groups.google.com/g/alt.sources/c/w0LgGPVB6f0/m/SDnD377as9IJ \
| egrep '<section>' \
| sed -e 's/.*<section>//' -e 's:</section>.*::' \
| python shar/sharify.py > shar/python-0.9.1-09-21.shar
"""

import html
import re
import sys

# Literal (old, new) pairs applied first.  The patterns are raw strings, so
# they match a backslash followed by "t" in the input, not a real tab.
# NOTE(review): the last three pairs read identically here; presumably the
# upstream file uses differing runs of spaces in them -- confirm the exact
# whitespace against the repository before relying on this table.
_TAB_FIXUPS = (
    (r'X\t', r'X '),
    (r'X \t', r'X '),
    (r'X \t', r'X '),
    (r'X \t', r'X '),
)

# Literal (old, new) pairs applied second: page framing fragments are dropped
# and escaped characters are turned back into the characters they stand for.
# Order matters: the combined "lt;"/"gt;" forms must be handled before the
# single \u003c / \u003e / \u0026 escapes they contain.
_ESCAPE_FIXUPS = (
    (',[2,[[1,[null,"', ''),
    (r'\u0026lt;\u003c', '<'),
    (r'\u003e\u0026gt;', '>'),
    (r'\u003c', '<'),
    (r'\u003d', '='),
    (r'\u003e', '>'),
    (r'\u0026', '&'),
    (r'\"', '"'),
    (r'"]', ''),
)

# Literal (old, new) pairs applied after HTML entities have been unescaped.
_MARKUP_FIXUPS = (
    ('<br>', '\n'),
)

# Anchor elements that are replaced by their link text (group 1).
_LINK_PATTERNS = (
    re.compile(r'<a href="[^"]+" target="_blank" rel="nofollow"'
               r' data-saferedirecturl="[^"]+">([^<]+)</a>'),
    re.compile(r'<a href data-email-masked rel="nofollow">([^<]+)</a>'),
)


def _replace_all(text, pairs):
    """Apply each literal (old, new) replacement in *pairs* to *text*, in order."""
    for old, new in pairs:
        text = text.replace(old, new)
    return text


def sharify(text):
    """Return *text* with the page escaping and markup undone.

    The steps run in a fixed order, each on the result of the previous one:

    1. the literal replacements in ``_TAB_FIXUPS``;
    2. the literal replacements in ``_ESCAPE_FIXUPS``;
    3. ``html.unescape`` to decode HTML character references;
    4. the literal replacements in ``_MARKUP_FIXUPS`` (``<br>`` to newline);
    5. each regex in ``_LINK_PATTERNS`` is substituted by its link text.

    Trailing whitespace is left alone here; ``main`` strips it on output.
    """
    text = _replace_all(text, _TAB_FIXUPS)
    text = _replace_all(text, _ESCAPE_FIXUPS)
    text = html.unescape(text)
    text = _replace_all(text, _MARKUP_FIXUPS)
    for pattern in _LINK_PATTERNS:
        text = pattern.sub(r'\1', text)
    return text


def main():
    """Filter stdin through ``sharify`` and write the stripped result to stdout."""
    sys.stdout.write(sharify(sys.stdin.read()).rstrip())


# Guarded so that importing the module does not read stdin.
if __name__ == '__main__':
    main()
