diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-01-09 16:23:05 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-09 16:23:05 +0200 |
commit | d9653894dffbfd3a58616f31b0967b34faf6f611 (patch) | |
tree | a81a62d7f320cd0caacd801f6878679744e80a16 | |
parent | 128de3585b0f58b1e562733448fc00109f23a95d (diff) |
scripts : script to get Paul Graham essays in txt format (#4838)
-rwxr-xr-x | scripts/get-pg.sh | 47 |
1 files changed, 47 insertions, 0 deletions
#!/bin/bash
#
# scripts/get-pg.sh
#
# Download the first <n> Paul Graham essays listed in an RSS feed, convert
# each one from HTML to plain text wrapped at 80 columns, and collect the
# result in pg.txt in the current directory. Any existing pg.txt is replaced.
#
# usage: get-pg.sh <n>
#   <n>  number of essays to fetch (positive integer)
#
# Requires: curl, grep, sed, head, tail, html2text, fmt

# Treat use of an unset variable as an error.
set -u

readonly FEED_URL="http://www.aaronsw.com/2002/feeds/pgessays.rss"
readonly OUT_FILE="pg.txt"

# Print the usage line to stderr and exit with status 1.
usage() {
  echo "usage: $0 <n>" >&2
  exit 1
}

# Print an error message to stderr and exit with status 1.
#   $* - message text
die() {
  echo "error: $*" >&2
  exit 1
}

# Exit with an error unless the given command can be found.
#   $1 - command name to look up
has_cmd() {
  if ! command -v "$1" > /dev/null 2>&1; then
    echo "error: $1 is not available" >&2
    exit 1
  fi
}

# check for every external tool used below
for cmd in curl grep sed head tail html2text fmt; do
  has_cmd "$cmd"
done

if [ $# -ne 1 ]; then
  usage
fi

n=$1

# head(1) needs a count; reject anything that is not a positive integer
if ! [[ "$n" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: <n> must be a positive integer, got '$n'" >&2
  usage
fi

# get urls
# --fail makes curl return non-zero on HTTP errors instead of handing an
# error page to the filters below.
feed="$(curl -fL "$FEED_URL")" || die "failed to download $FEED_URL"

# Keep lines mentioning "html", trim everything before "http" and after
# "html", then take the first <n> results.
urls="$(grep html <<< "$feed" \
  | sed -e "s/.*http/http/" -e "s/html.*/html/" \
  | head -n "$n")"

if [ -z "$urls" ]; then
  die "no essay urls found in $FEED_URL"
fi

printf "urls:\n%s\n" "$urls"

# start from a clean output file; essays are appended one by one
rm -f -- "$OUT_FILE"

while IFS= read -r url; do
  [ -n "$url" ] || continue

  echo "processing $url"

  # curl gets /dev/null on stdin so it cannot consume the url list feeding
  # this loop. tail drops the first 3 lines of the converted text, sed strips
  # leading whitespace, and fmt re-wraps to 80 columns.
  curl -fL "$url" < /dev/null \
    | html2text \
    | tail -n +4 \
    | sed -E "s/^[[:space:]]+//g" \
    | fmt -w 80 >> "$OUT_FILE"

  # PIPESTATUS[0] is curl's exit status; one failed essay is reported but
  # does not abort the remaining downloads.
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "warning: failed to download $url" >&2
  fi

  # don't flood the server
  sleep 1
done <<< "$urls"

echo "done. data in $OUT_FILE"

exit 0