summary refs log tree commit diff
diff options
context:
space:
mode:
author Georgi Gerganov <ggerganov@gmail.com> 2024-01-09 19:20:45 +0200
committer Georgi Gerganov <ggerganov@gmail.com> 2024-01-09 19:21:13 +0200
commit 9a818f7c42761984ac99e08e613cc20634f8410e (patch)
tree f8bca94fc1dc109c91fda6d844d449e2b2015159
parent 18adb4e9bb340b7b4565d8b6715b4449283e7641 (diff)
scripts : improve get-pg.sh (#4838)
-rwxr-xr-x scripts/get-pg.sh 25
1 files changed, 24 insertions, 1 deletions
diff --git a/scripts/get-pg.sh b/scripts/get-pg.sh
index d516db46..b027793e 100755
--- a/scripts/get-pg.sh
+++ b/scripts/get-pg.sh
@@ -2,6 +2,22 @@
function usage {
echo "usage: <n>$0"
+ echo "note: n is the number of essays to download"
+ echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
+ echo "n | tokens"
+ echo "--- | ---"
+ echo "1 | 6230"
+ echo "2 | 23619"
+ echo "5 | 25859"
+ echo "10 | 36888"
+ echo "15 | 50188"
+ echo "20 | 59094"
+ echo "25 | 88764"
+ echo "30 | 103121"
+ echo "32 | 108338"
+ echo "35 | 113403"
+ echo "40 | 127699"
+ echo "45 | 135896"
exit 1
}
@@ -33,10 +49,17 @@ if [ -f pg.txt ]; then
rm pg.txt
fi
+c=1
for url in $urls; do
echo "processing $url"
- curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
+ cc=$(printf "%03d" $c)
+
+ curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+ cat pg-$cc-one.txt >> pg.txt
+
+ cp -v pg.txt pg-$cc-all.txt
+ c=$((c+1))
# don't flood the server
sleep 1