#!/usr/bin/env python
# Origin: commit c0f170462209484074470d693af44c65e5a252ac by Skip Montanaro,
# Tue, 16 Feb 2021 20:17:05 -0600 -- "simple script to extract (most of) the
# original shell archive from the Google Groups HTML" (adds shar/sharify.py).

r"""A better reconstitution of the original shell archives from the Google HTML files.

Reads the extracted message text on stdin, undoes the escaping layers and
writes the result to stdout.

Example (for part 09):

    curl https://groups.google.com/g/alt.sources/c/w0LgGPVB6f0/m/SDnD377as9IJ \
    | egrep 'MARKUP' \
    | sed -e 's/.*MARKUP//' -e 's:MARKUP.*::' \
    | python shar/sharify.py > shar/python-0.9.1-09-21.shar

NOTE(review): MARKUP stands in for HTML tag patterns that are not legible in
the reviewed copy of this script; restore them from the repository.
"""

import html
import re
import sys

# Raw strings: the patterns match the two-character sequence backslash + "t",
# not a real tab character.  The leading "X" is presumably the per-line prefix
# used inside the shell archive -- TODO confirm.
# NOTE(review): the last three entries are identical as shown and every
# replacement ends in a single space; runs of spaces may have been collapsed
# in the reviewed copy -- confirm against the repository.
_TAB_EXPANSIONS = (
    (r'X\t', r'X '),
    (r'X \t', r'X '),
    (r'X \t', r'X '),
    (r'X \t', r'X '),
)

# Applied in order: the combined sequences must be handled before the single
# \u003c / \u003e / \u0026 escapes they are built from.
_ESCAPE_REPLACEMENTS = (
    (',[2,[[1,[null,"', ''),
    (r'\u0026lt;\u003c', '<'),
    (r'\u003e\u0026gt;', '>'),
    (r'\u003c', '<'),
    (r'\u003d', '='),
    (r'\u003e', '>'),
    (r'\u0026', '&'),
    (r'\"', '"'),
    (r'"]', ''),
)

# NOTE(review): as shown this maps a newline to itself (a no-op).  The pattern
# was presumably an HTML line-break tag that is missing from the reviewed
# copy -- TODO restore from the repository.
_LINE_BREAK_REPLACEMENTS = (
    ('\n', '\n'),
)

# NOTE(review): as shown both substitutions replace a run of non-"<"
# characters with itself (a no-op).  The patterns presumably had markup around
# the capture group that is missing from the reviewed copy -- TODO restore.
_REGEX_REPLACEMENTS = (
    (r'([^<]+)', r'\1'),
    (r'([^<]+)', r'\1'),
)


def _replace_all(text, pairs):
    """Apply each literal (old, new) replacement in *pairs* to *text*, in order."""
    for old, new in pairs:
        text = text.replace(old, new)
    return text


def sharify(text: str) -> str:
    """Return *text* with the escaping layers undone.

    Steps, in order: expand escaped tabs, decode the literal ``\\uXXXX`` and
    quote escapes, unescape HTML entities, convert line-break markup, then
    apply the regex substitutions.
    """
    text = _replace_all(text, _TAB_EXPANSIONS)
    text = _replace_all(text, _ESCAPE_REPLACEMENTS)
    # Must come after "\u0026" -> "&" so that entities spelled "\u0026lt;"
    # are decoded as well.
    text = html.unescape(text)
    text = _replace_all(text, _LINE_BREAK_REPLACEMENTS)
    for pattern, replacement in _REGEX_REPLACEMENTS:
        text = re.sub(pattern, replacement, text)
    return text


def main() -> None:
    """Filter stdin to stdout, dropping trailing whitespace from the result."""
    # rstrip() also removes the final newline, so the output ends without one.
    sys.stdout.write(sharify(sys.stdin.read()).rstrip())


if __name__ == '__main__':
    main()