summaryrefslogtreecommitdiff
path: root/scripts/get-pg.sh
blob: d516db46cf01fef3d497ddf61e441eabac3de634 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash

function usage {
    echo "usage: <n>$0"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v $1)" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# get urls
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n $n)"

printf "urls:\n%s\n" "$urls"

if [ -f pg.txt ]; then
    rm pg.txt
fi

for url in $urls; do
    echo "processing $url"

    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0