#!/bin/bash
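#statstool gathers local git history data and GitHub PR/issue data for a
#repository, then computes a set of review and PR statistics into
#${DATADIR}/stats.json (one compact JSON object per line)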
#allow passing some environment variables to override some automated steps
if [ -z "$DATADIR" ]; then
DATADIR=""
fi
if [ -z "$ORIGIN_URL" ]; then
ORIGIN_URL=""
fi
if [ -z "$DEFAULT_BRANCH" ]; then
DEFAULT_BRANCH=""
fi
if [ -z "$FAIL_ON_RENAME" ]; then
FAIL_ON_RENAME=false
fi
if [ -z "$SKIP_ANALYSIS" ]; then
SKIP_ANALYSIS=false
fi
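#example invocations (hypothetical paths, URLs and token; any variable may be omitted):
#  GITHUB_TOKEN=<token> ./statstool
#  GITHUB_TOKEN=<token> DATADIR=/tmp/mystats SKIP_ANALYSIS=true ./statstool  #fetch data only
#  GITHUB_TOKEN=<token> ORIGIN_URL=git@github.com:owner/repo.git DEFAULT_BRANCH=main ./statstool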
set -eu -o pipefail
# cross-OS compatibility (greadlink, gsed, gzcat are GNU implementations for OS X)
[[ $(uname) == 'Darwin' ]] && {
shopt -s expand_aliases
which greadlink gsed gzcat gjoin gmktemp gdate gwc > /dev/null && {
unalias readlink sed zcat join mktemp date wc >/dev/null 2>/dev/null
alias readlink=greadlink sed=gsed zcat=gzcat join=gjoin mktemp=gmktemp date=gdate wc=gwc
} || {
echo 'ERROR: GNU utils required for Mac. You may use homebrew to install them: brew install coreutils gnu-sed'
exit 1
}
}
PR_SAMPLESIZE=250
COMMIT_PR_SAMPLESIZE=200
#an outer bound for how many pages of PRs we'll try to fetch
#hopefully this is enough to capture ~1 year of PRs even for very active repos
#the vast majority (>95%) of OSS benchmark repos are captured by just 20 pages
MAX_ALL_PR_PAGES=60
#an outer bound for how much data we'll pull from github, 2 years
MAX_SPAN_DAYS=730
#we'll run sets of time-bound analyses:
SPAN_DAYS="183 365 730 1460"
#SPAN_DAYS=365
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "this tool requires several other tools to be installed."
echo "if they are not the dependency check that follows will fail."
echo "see README.md in the same folder as statstool for details."
echo
echo "checking for dependencies..."
echo "checking for bash..."
bash --version 2>&1 | grep 'bash'
echo "checking for gzip..."
gzip --version 2>&1 | grep 'gzip'
echo "checking for zcat..."
zcat --version 2>&1 | grep 'gzip'
echo "checking for curl..."
curl --version | grep -E 'curl [78]\.[0-9]'
echo "checking for git..."
git --version | grep 'git version [0-9]'
echo "checking for sed..."
sed --version | grep "sed.* 4\."
echo "checking for awk..."
gawk --version | grep 'GNU Awk [0-9]'
echo "checking for join..."
join --version | grep 'join (GNU coreutils) [0-9]'
echo "checking for pv..."
pv --version | grep 'pv [0-9]'
echo "checking for ag..."
ag --version | grep 'ag version [0-9]'
echo "checking for mktemp..."
mktemp --version | grep 'mktemp'
echo "checking for date..."
date --version | grep 'GNU coreutils'
echo "checking for wc..."
wc --version | grep 'GNU coreutils'
echo "running a quick test..."
testfile=$(mktemp tmp.XXXXXXXXXX.gz)
echo "hello" | gzip -c > $testfile
zcat $testfile | sed 's/hello/hiya/' | gawk '/hiya/ { printf("{\"message\":\"sed gawk pv jq and ag all work\"}\n")}' | pv -s 1 -l | jq '.message' | ag 'sed gawk pv jq and ag all work'
zcat $testfile | wc -l | ag '^1$'
rm $testfile
test_date="$(date --utc)"
date -d@$(date -d "${test_date}" +%s) --utc | grep "${test_date}"
echo "dependencies look good"
echo
echo "we need to gather some data before we can start computing some stats"
echo "first we'll process some local commit data using things like git log and git show"
echo "second we'll download some data from the GitHub like pulls, issues and comments"
echo "we are only going to read data from your repo and GitHub"
echo "all of this can take 15-30 minutes, so sit back and relax"
echo
#if we aren't just computing stats on a predefined datadir
if [ -z "${DATADIR}" ] || [ ! -e "${DATADIR}" ] ; then
#cd to the toplevel of the repo, which some steps below assume we are at
cd "$(git rev-parse --show-toplevel)"
ORIGIN_REMOTE="origin"
if [ -z "$ORIGIN_URL" ]; then
#try to autodetect the github repo
echo "autodetecting the origin remote url"
echo
ORIGIN_URL="$(git config --get-regex remote.${ORIGIN_REMOTE}.url | gawk '{print $2}')" || true
if [ -z "$ORIGIN_URL" ] || ! echo "$ORIGIN_URL" | grep -E '^(git@|https:\/\/)([^:@/]*(:[^:@]*)?@)?github.com[:\/]([^\/]*\/[^\/]*)(\.git)?$' > /dev/null; then
echo "it looks like you don't have an origin remote, or it's not pointing to a github repository."
echo "you can provide your own github repository, check out the README"
exit 1
fi
else
#let the user provide their own origin url
echo "using provided origin url ${ORIGIN_URL}"
echo
if ! echo "$ORIGIN_URL" | grep -E '^(git@|https:\/\/)github.com[:\/]([^\/]*\/[^\/]*)(\.git)?$' > /dev/null; then
echo "it looks like the origin url you provided (${ORIGIN_URL}) isn't a github repository url."
echo "check the README.md or contact AutoDev folks for more assistance."
exit 1
fi
fi
REPO=$(echo "$ORIGIN_URL" | sed 's/\.git$//' | sed -E '/^(git@|https:\/\/([^:@/]*(:[^:@]*)?@)?)github.com[:\/]/!{q1}; {s/.*github.com[:\/]([^\/]*\/[^\/]*)$/\1/}')
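#e.g. (hypothetical) git@github.com:owner/repo.git and https://github.com/owner/repo
#both reduce to owner/repo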
echo "using remote url ${ORIGIN_URL}"
echo "this corresponds to the github repository at https://github.com/${REPO}"
echo "this is the repository whose pull requests and issues we'll analyze"
echo
REPO_FNSAFE=$(echo "$REPO" | sed 's/\//_/')
if [ -z "${DATADIR}" ]; then
echo "choosing new directory to hold temp data"
DATADIR=$(mktemp --directory --tmpdir autodevstats.${REPO_FNSAFE}.XXXXXXXXXX)
else
echo "using given path to hold temp data"
mkdir "${DATADIR}"
fi
echo "we'll write data to ${DATADIR}"
echo
#truncate timing data
> ${DATADIR}/times.tsv
#record when we start pulling data
#this can be used as a good timestamp for when data collection happened
data_pull_start_time=$(date +%s)
printf "data_pull_start_time\t%d\n" $data_pull_start_time >> ${DATADIR}/times.tsv
echo "checking access to github for ${REPO}..."
REPO_URL="https://api.github.com/repos/${REPO}"
curl -L --compressed -s -H "Authorization: token ${GITHUB_TOKEN}" "${REPO_URL}" > ${DATADIR}/repo
if ! cat ${DATADIR}/repo | jq '.full_name' | ag -v '^null$' > /dev/null; then
echo "no access to ${REPO_URL}"
echo "check that your access token has access to the repos scope"
echo "see GitHub's documentation here: https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line"
echo "we need the \"repo\" top-level scope as described in the tutorial"
exit 1
elif ! cat ${DATADIR}/repo | jq '.full_name' | ag '^"'"${REPO}"'"$' > /dev/null; then
NEW_REPO_NAME=$(cat ${DATADIR}/repo | jq '.full_name')
echo "it appears that ${REPO} has been renamed to ${NEW_REPO_NAME}"
if $FAIL_ON_RENAME; then
echo "exiting because of repo rename"
exit 1
fi
fi
echo "access to github looks good."
echo
#figure out the default branch
if [ -z "$DEFAULT_BRANCH" ]; then
echo "autodtecting default branch from github configuration..."
echo
DEFAULT_BRANCH=$(cat ${DATADIR}/repo | jq -r '.default_branch')
#TODO: check that default branch is tracked by $REPO
implicated_remote=$(git config --get-regexp branch.${DEFAULT_BRANCH}.remote | gawk '{print $2}') || true
if ! git config --get-regexp remote.${implicated_remote}.url | grep "$REPO" > /dev/null; then
echo "it appears that ${DEFAULT_BRANCH} is not tracked by the github repository ${REPO}"
echo "if you really want to proceed with analysis, you can override both ORIGIN_URL and DEFAULT_BRANCH and we won't check for this. see the README for more information"
echo
exit 1
fi
else
echo "using provided default branch ${DEFAULT_BRANCH}"
echo
fi
echo "default branch is ${DEFAULT_BRANCH}"
echo "this is the branch whose history we'll process for merges and pulls"
echo
if ! git branch --list "$DEFAULT_BRANCH" | grep -q .; then
echo "you don't appear to have a branch called ${DEFAULT_BRANCH}"
echo "we can't proceed without the default branch checked out"
echo "perhaps you need to run git fetch ${ORIGIN_REMOTE} ${DEFAULT_BRANCH}?"
exit 1
fi
echo "preparing code history..."
starttime=$(date +%s)
DATADIR=${DATADIR} DEFAULT_BRANCH=${DEFAULT_BRANCH} FILE_EXCLUDE_PATHS=${DIR}/excludefiles.regex ${DIR}/build_features.sh
code_history_time=$(( $(date +%s) - ${starttime}))
echo "done preparing code history in ${code_history_time}s."
echo
printf "code_history_time\t%f\n" ${code_history_time} >> ${DATADIR}/times.tsv
echo "preparing commit messages..."
git log "${DEFAULT_BRANCH}" --first-parent --format='__commit__ %H%x0A%B' -- > ${DATADIR}/commit_messages
git log "${DEFAULT_BRANCH}" --format='%H%x09%ae' -- | LC_ALL=C sort > ${DATADIR}/commits_with_author
git log "${DEFAULT_BRANCH}" --topo-order --format='%H%x09%P%x09%ce%x09%ct%x09%ae%x09%at' -- | gzip -c > ${DATADIR}/commit_graph.gz
echo "done."
#TODO: sampling on what we're pulling from github
# for instance, we could grab the most recent K PRs (or limit to closed PRs)
echo "now we'll get some data from github. this might take a while."
echo "fetching pull request and issue data..."
starttime=$(date +%s)
echo 'https://api.github.com/repos/'${REPO}'/languages' | GITHUB_TOKEN=$GITHUB_TOKEN ${DIR}/fetch-comments.sh > ${DATADIR}/languages.json
echo 'https://api.github.com/repos/'${REPO}'/pulls?state=all&sort=created&direction=desc&per_page=100' | GITHUB_TOKEN=$GITHUB_TOKEN MAX_PAGES=${MAX_ALL_PR_PAGES} ALLCOMMENTS="" ${DIR}/fetch-comments.sh | gzip -c > ${DATADIR}/pulls.gz
EARLIEST_PR=$(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .created_at' | sort -r | tail -n1)
LATEST_PR=$(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .updated_at' | sort | tail -n1)
LATEST_COMMIT=$(date -d@$(cat ${DATADIR}/commitdates | cut -f 2 | sort -n | tail -n1) --utc +%Y-%m-%dT%H:%M:%SZ)
#tried using the latest of commit and PR, but commit makes more sense
#for some inactive repos PRs might still get comments long after the last commit
#LATEST_DATE=$(printf "%s\n%s\n" "${LATEST_PR}" "${LATEST_COMMIT}" | gawk '$1 > max_date { max_date = $1 } END {print max_date}')
LATEST_DATE=$LATEST_COMMIT
MAX_SPAN_DATE=$(date -d@$(( $(date -d "$LATEST_DATE" +%s) - $(( $MAX_SPAN_DAYS * 86400 )) )) --utc +%Y-%m-%dT%H:%M:%SZ)
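#e.g. (hypothetical) LATEST_DATE=2021-01-01T00:00:00Z with MAX_SPAN_DAYS=730
#gives MAX_SPAN_DATE=2019-01-02T00:00:00Z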
printf "EARLIEST_PR\t%d\n" $(date -d $EARLIEST_PR +%s) >> ${DATADIR}/times.tsv
printf "LATEST_PR\t%d\n" $(date -d $LATEST_PR +%s) >> ${DATADIR}/times.tsv
printf "LATEST_COMMIT\t%d\n" $(date -d $LATEST_COMMIT +%s) >> ${DATADIR}/times.tsv
printf "MAX_SPAN_DATE\t%d\n" $(date -d $MAX_SPAN_DATE +%s) >> ${DATADIR}/times.tsv
#limit to max span of analysis
EARLIEST_DATE=$EARLIEST_PR
if [[ "$EARLIEST_DATE" < "$MAX_SPAN_DATE" ]]; then
EARLIEST_DATE=$MAX_SPAN_DATE
fi
if ! (echo 'https://api.github.com/repos/'${REPO}'/pulls/comments?since='${EARLIEST_DATE}'&sort=created&order=desc&per_page=100' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" ${DIR}/fetch-comments.sh | gzip -c > ${DATADIR}/pull-comments.gz); then
zcat ${DATADIR}/pulls.gz | jq -r '.[] | .review_comments_url' | sed 's/$/?per_page=100/' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" SILENT=true ${DIR}/fetch-comments.sh | pv -l -s $(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .review_comments_url' | wc -l) | gzip -c > ${DATADIR}/pull-comments.gz
fi
#TODO: issue comments might be limited to just 400 pages, need to make sure we get the issue comments for the relevant PRs
if ! (echo 'https://api.github.com/repos/'${REPO}'/issues/comments?since='${EARLIEST_DATE}'&sort=created&order=desc&per_page=100' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" ${DIR}/fetch-comments.sh | gzip -c > ${DATADIR}/issue-comments.gz); then
zcat ${DATADIR}/pulls.gz | jq -r '.[] | .comments_url' | sed 's/$/?per_page=100/' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" SILENT=true ${DIR}/fetch-comments.sh | pv -l -s $(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .comments_url' | wc -l) | gzip -c > ${DATADIR}/issue-comments.gz
fi
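#both fallbacks above handle the bulk since-based comment endpoints failing
#(e.g. hitting page limits) by walking each PR's own comment URLs instead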
#if [ $(zcat ${DATADIR}/issue-comments.gz | zcat | wc -l) -eq 400 ]; then
#TODO: what to do if we get cut off on comments?
#A. replace with pull-by-pull and live with not having non-PR comments
#B. repeatedly recompute latest date and ask for more since then
#C. compute diff on PRs and fetch missing (maybe handle the edge case?)
#D. ignore it? maybe measure how often this happens?
#fi
echo 'https://api.github.com/repos/'${REPO}'/issues?since='${EARLIEST_DATE}'&state=all&sort=created&order=desc&per_page=100' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" ${DIR}/fetch-comments.sh | gzip -c > ${DATADIR}/issues.gz
#zcat ${DATADIR}/pulls.gz | jq -r '.[] | .issue_url' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" SILENT=true ${DIR}/fetch-comments.sh | pv -l -s $(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .issue_url' | wc -l) | gzip -c > ${DATADIR}/issue.gz
#TODO: we don't use this data anywhere, maybe don't fetch it?
#echo 'https://api.github.com/repos/'${REPO}'/comments?per_page=100' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" ${DIR}/fetch-comments.sh | gzip -c > ${DATADIR}/commit-comments.gz
github_fetch_time=$(( $(date +%s) - ${starttime}))
echo "done fetching pull request and issue data in ${github_fetch_time}s."
echo
printf "github_fetch_time\t%f\n" ${github_fetch_time} >> ${DATADIR}/times.tsv
echo "checking with GitHub how a sample of commits relate to pulls..."
starttime=$(date +%s)
cat ${DATADIR}/commitdates |\
gawk -vrepo=${REPO} -vearliest=$(date -d "${EARLIEST_DATE}" +%s) -F\\t '$2>=earliest {printf("https://api.github.com/repos/%s/commits/%s/pulls\n", repo, $1)}' |\
sort -R | tail -n${COMMIT_PR_SAMPLESIZE} |\
GITHUB_TOKEN=$GITHUB_TOKEN HEADER_ACCEPT="application/vnd.github.groot-preview+json" PREFIX_URL=true SILENT=true ${DIR}/fetch-comments.sh | pv -l -s${COMMIT_PR_SAMPLESIZE} \
> ${DATADIR}/commit_pulls
github_commit_pull_time=$(( $(date +%s) - ${starttime}))
echo "done pulling commit pull request info in ${github_commit_pull_time}s."
echo
printf "github_commit_pull_time\t%f\n" ${github_commit_pull_time} >> ${DATADIR}/times.tsv
if ! (cat ${DATADIR}/pulls.gz | zcat | head -n1 || true) | grep ', "number": [0-9]*' > /dev/null; then
echo "there are no PRs in this repository"
echo
#just leave the sample files empty
touch ${DATADIR}/pr_sample
touch ${DATADIR}/pr_sample_pulls
touch ${DATADIR}/pr_sample_commits
else
#get a sample of pulls to prep some additional data
echo "preparing additional per-PR data for a ${PR_SAMPLESIZE} PR sample of PRs..."
starttime=$(date +%s)
echo "drawing the sample..."
pv ${DATADIR}/pulls.gz | zcat | grep -o ', "number": [0-9]*' | sed -e 's/.* \([0-9]*\)$/\1/' | sort -R | tail -n${PR_SAMPLESIZE} | LC_ALL=C sort > ${DATADIR}/pr_sample
echo "fetching full pull objects for PR sample (this might take a while)..."
cat ${DATADIR}/pr_sample | gawk '{printf("https://api.github.com/repos/'${REPO}'/pulls/%d?per_page=100\n", $1)}' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" SILENT=true ${DIR}/fetch-comments.sh | pv -l -s$(cat ${DATADIR}/pr_sample | wc -l) > ${DATADIR}/pr_sample_pulls
echo "fetch commit list for PR sample (this might take a while)..."
cat ${DATADIR}/pr_sample | gawk '{printf("https://api.github.com/repos/'${REPO}'/pulls/%d/commits?per_page=100\n", $1)}' | GITHUB_TOKEN=$GITHUB_TOKEN ALLCOMMENTS="" SILENT=true PREFIX_URL=true ${DIR}/fetch-comments.sh | pv -l -s$(cat ${DATADIR}/pr_sample | wc -l) > ${DATADIR}/pr_sample_commits
github_pull_sample_time=$(( $(date +%s) - ${starttime}))
echo "done collecting sample PR data in ${github_pull_sample_time}s."
echo
printf "github_pull_sample_time\t%f\n" ${github_pull_sample_time} >> ${DATADIR}/times.tsv
fi
data_pull_time=$(( $(date +%s) - ${data_pull_start_time} ))
printf "data_pull_time\t%d\n" $data_pull_time >> ${DATADIR}/times.tsv
elif [ -d "${DATADIR}" ]; then
echo "we'll process existing data ${DATADIR}"
echo
#catch up on some state we might need
EARLIEST_PR=$(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .created_at' | sort -r | tail -n1)
LATEST_PR=$(zcat ${DATADIR}/pulls.gz | jq -r '.[] | .updated_at' | sort | tail -n1)
LATEST_COMMIT=$(date -d@$(cat ${DATADIR}/commitdates | cut -f 2 | sort -n | tail -n1) --utc +%Y-%m-%dT%H:%M:%SZ)
#tried using the latest of commit and PR, but commit makes more sense
#for some inactive repos PRs might still get comments long after the last commit
#LATEST_DATE=$(printf "%s\n%s\n" "${LATEST_PR}" "${LATEST_COMMIT}" | gawk '$1 > max_date { max_date = $1 } END {print max_date}')
LATEST_DATE=$LATEST_COMMIT
MAX_SPAN_DATE=$(date -d@$(( $(date -d "$LATEST_DATE" +%s) - $(( $MAX_SPAN_DAYS * 86400 )) )) --utc +%Y-%m-%dT%H:%M:%SZ)
#limit to max span of analysis
EARLIEST_DATE=$EARLIEST_PR
if [[ "$EARLIEST_DATE" < "$MAX_SPAN_DATE" ]]; then
EARLIEST_DATE=$MAX_SPAN_DATE
fi
if [ -z "$DEFAULT_BRANCH" ]; then
DEFAULT_BRANCH=$(cat ${DATADIR}/repo | jq -r '.default_branch')
fi
else
echo "${DATADIR} exists but isn't a directory, run without that param and we'll create a directory to hold data"
exit 1
fi
if $SKIP_ANALYSIS; then
echo "done fetching data. skipping analysis."
exit 0
fi
#truncate stats time file
> ${DATADIR}/statstimes.tsv
echo "computing stats..."
starttime=$(date +%s)
printf "stats_start_time\t%d\n" $starttime >> ${DATADIR}/statstimes.tsv
#truncate stats file
> ${DATADIR}/stats.json
#metadata and identification
echo "metadata"
cat ${DATADIR}/repo | jq -c -f ${DIR}/metadata.jq >> ${DATADIR}/stats.json
echo "bytes of code per language"
cat ${DATADIR}/languages.json | jq -c '{"stat":"languages", data: .}' >> ${DATADIR}/stats.json
cat ${DATADIR}/times.tsv |\
jq -c -R --slurp 'split("\n")[0:-1] | map(split("\t") | {(.[0]): (.[1] | tonumber)}) | add | {"stat": "data_times", "data": . }' \
>> ${DATADIR}/stats.json
#gather PR status
echo "prepping pr statuses..."
pv ${DATADIR}/pulls.gz | zcat | jq -r '.[] | [.number, if .merged_at != null then "merged" else .state end, .created_at, .closed_at, .merge_commit_sha] | @tsv' | LC_ALL=C sort > ${DATADIR}/pr_status
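#pr_status columns: number, state (merged/open/closed), created_at, closed_at, merge_commit_sha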
echo "prepping issue statuses..."
pv ${DATADIR}/issues.gz | zcat | jq -r '.[] | [.number, .state] | @tsv' | LC_ALL=C sort | LC_ALL=C join -t$'\t' -v1 - ${DATADIR}/pr_status > ${DATADIR}/issue_status
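#issue_status keeps number and state for issues only; join -v1 drops anything
#whose number also appears in pr_status (i.e. pull requests)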
#PR outcomes overall
echo "PR outcome counts"
cat ${DATADIR}/pr_status | gawk -F\\t 'BEGIN {OFS="\t"} {state[$2] +=1} END {for(x in state) { print x, state[x]}}' | jq -c --raw-input --slurp 'split("\n") | map(split("\t")) | .[0:-1] | map({"state":.[0], "count":.[1]}) | {"stat":"pr_state", "data":.}' >> ${DATADIR}/stats.json
#issue outcomes overall
echo "issue outcome counts"
cat ${DATADIR}/issue_status | gawk -F\\t 'BEGIN {OFS="\t"} {state[$2] +=1} END {for(x in state) { print x, state[x]}}' | jq -c --raw-input --slurp 'split("\n") | map(split("\t")) | .[0:-1] | map({"state":.[0], "count":.[1]}) | {"stat":"issue_state", "data":.}' >> ${DATADIR}/stats.json
#PRs over time by outcome
echo "PR outcome over time"
cat ${DATADIR}/pr_status | gawk -F\\t -i ${DIR}/date.awk 'BEGIN {OFS="\t"} {s[$2][int(parsedate($3)/(3600*24*30))*3600*24*30] += 1} END {for(x in s) { for(t in s[x]) { print x, t, s[x][t] } } }' | sort -k1,1 -k2n,2n |\
jq -c --slurp --raw-input --arg stat_name pr_over_time -f ${DIR}/groupscatter2json.jq\
>> ${DATADIR}/stats.json
#PR lifetime distribution
echo "PR lifetime distribution"
cat ${DATADIR}/pr_status | gawk -F\\t -i ${DIR}/date.awk 'BEGIN {OFS="\t"} function bin(t) { if(t<0) { t=1} binned = int(1.3^(int(log(t)/log(1.3)))); if(t>3600*24*365) { binned = 3600*24*365 } return binned} $2=="open" {s[$2][bin(systime() - parsedate($3))]+=1} $2!="open" { s[$2][bin(parsedate($4) - parsedate($3))] += 1} END {for(x in s) { for(t in s[x]) { print x, t, s[x][t] } } }' | sort -k1,1 -k2n,2n |\
jq -c --slurp --raw-input --arg stat_name pr_lifetime -f ${DIR}/groupscatter2json.jq\
>> ${DATADIR}/stats.json
#PR lifetime summary stats
echo "PR lifetime summary"
cat ${DATADIR}/pr_status | gawk -F\\t -i ${DIR}/date.awk 'BEGIN {OFS="\t"} $2=="open" {print $2, systime() - parsedate($3)} $2!="open" { print $2, parsedate($4) - parsedate($3)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_lifetime_summary -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
#PR comments, only PRs with non-zero comments
echo "preparing comment counts..."
> ${DATADIR}/commentcounts.tmp
#code comments
zcat ${DATADIR}/pull-comments.gz |\
jq -r '.[] | .pull_request_url' | sed 's/.*\/\([0-9]*\)$/\1/' |\
LC_ALL=C sort | uniq -c |\
gawk 'BEGIN {OFS="\t"} {print $2, "codecomment", $1}'\
>> ${DATADIR}/commentcounts.tmp
#toplevel comments
zcat ${DATADIR}/issue-comments.gz |\
jq -r '.[] | .issue_url' | sed 's/.*\/\([0-9]*\)$/\1/' |\
LC_ALL=C sort | uniq -c |\
gawk 'BEGIN {OFS="\t"} {print $2, "toplevel", $1}'\
>> ${DATADIR}/commentcounts.tmp
#all comments (no zeroes)
cat\
<(zcat ${DATADIR}/issue-comments.gz |\
jq -r '.[] | .issue_url' | sed 's/.*\/\([0-9]*\)$/\1/')\
<(zcat ${DATADIR}/pull-comments.gz |\
jq -r '.[] | .pull_request_url' | sed 's/.*\/\([0-9]*\)$/\1/') |\
LC_ALL=C sort | uniq -c |\
gawk 'BEGIN {OFS="\t"} {print $2, "allcomments", $1}'\
>> ${DATADIR}/commentcounts.tmp
#all comments (including zeros)
cat\
<(zcat ${DATADIR}/issue-comments.gz |\
jq -r '.[] | .issue_url' | sed 's/.*\/\([0-9]*\)$/\1/')\
<(zcat ${DATADIR}/pull-comments.gz |\
jq -r '.[] | .pull_request_url' | sed 's/.*\/\([0-9]*\)$/\1/') |\
LC_ALL=C sort | uniq -c |\
gawk 'BEGIN {OFS="\t"} {print $2, "allcommentswzero", $1}' |\
LC_ALL=C join -t$'\t' -a1 <(cat ${DATADIR}/pr_status | cut -f1) - |\
gawk -F\\t 'BEGIN {OFS="\t"} $2=="" {print $1, "allcommentswzero", 0} $2!="" {print}'\
>> ${DATADIR}/commentcounts.tmp
cat ${DATADIR}/commentcounts.tmp | LC_ALL=C sort > ${DATADIR}/commentcounts
rm ${DATADIR}/commentcounts.tmp
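#commentcounts columns: pr number, count kind (codecomment/toplevel/allcomments/allcommentswzero), count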
echo "PR comment count summary"
cat ${DATADIR}/commentcounts | LC_ALL=C sort | LC_ALL=C join -t$'\t' ${DATADIR}/pr_status - | gawk -F\\t '{printf("%s-%s\t%d\n", $2, $6, $7)}' |\
gawk -F\\t -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_comment_summary -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
#PR sizes (additions + deletions), a sample
echo "PR size summary"
cat ${DATADIR}/pr_sample_pulls | jq -r '[.number, .additions + .deletions] | @tsv' | LC_ALL=C join -t$'\t' ${DATADIR}/pr_status - | cut -f 2,6 |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_size_summary -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
#PR sizes vs comments
echo "PR size vs comments"
cat ${DATADIR}/pr_sample_pulls | jq -r '[.number, .additions + .deletions] | @tsv' | LC_ALL=C join -t$'\t' ${DATADIR}/pr_status - | \
LC_ALL=C join -a1 -t$'\t' - ${DATADIR}/commentcounts \
| gawk -F\\t 'BEGIN {OFS="\t"} $7=="" { print $2, $6, 0} $6=="allcomments" {print $2, $6, $8}' | sort |\
jq -c --slurp --raw-input --arg stat_name pr_size_vs_comments -f ${DIR}/groupscatter2json.jq\
>> ${DATADIR}/stats.json
#use of autolinks in several contexts
echo "gathering use of autolinks in commits..."
cat ${DATADIR}/commit_messages |\
gawk 'BEGIN {OFS="\t"} /^__commit__ [a-f0-9]{40}$/ {commit=$2} !/^__commit__ [a-f0-9]{40}$/ {print commit, $0}' |\
gawk -F\\t -v dsname=commit -f ${DIR}/extractautolinks.awk\
> ${DATADIR}/commit_autolinks
#GitHub PR merge commits
#TODO: break this into one set without filtering by PRs and one set that does
echo "PR merge commits"
printf "%d\t%d\t%d\t%d\n"\
$(cat ${DATADIR}/commit_messages | (grep -E '^__commit__ [a-f0-9]{40}$' || true) | wc -l)\
$(cat ${DATADIR}/commit_messages | (grep -E 'Merge pull request #[0-9]+ from' || true) | wc -l)\
$(cat ${DATADIR}/commit_messages | (grep -E -A1 '^__commit__ [a-f0-9]{40}$' || true) | (grep -E ' \(#[0-9]+\)$' || true) | wc -l)\
$(cat ${DATADIR}/commit_autolinks | grep 'closes' | cut -f 1 | LC_ALL=C sort -u | wc -l) |\
jq -c --slurp --raw-input 'split("\t") | {"stat":"gh_merges", "data":{"commits":(.[0]|tonumber), "gh_merges":(.[1]|tonumber), "gh_likely_merge":(.[2]|tonumber), "likely_external_merge":(.[3]|tonumber)}}'\
>> ${DATADIR}/stats.json
echo "PR merge commits during analysis period"
printf "%d\t%d\t%d\t%d\n"\
$(cat ${DATADIR}/commit_messages | ag '__commit__ [a-f0-9]{40}' | gawk '{print $2}' | LC_ALL=C sort | LC_ALL=C join - ${DATADIR}/commitdates | gawk '$2 >= '$(date -d ${EARLIEST_DATE} +%s) | wc -l)\
$(cat ${DATADIR}/commit_messages | (grep -E -o 'Merge pull request #[0-9]+ from' || true) | grep -o '[0-9]*' | LC_ALL=C sort | LC_ALL=C join - ${DATADIR}/pr_status | wc -l)\
$(cat ${DATADIR}/commit_messages | (grep -E -A1 '^__commit__ [a-f0-9]{40}$' || true) | (grep -E -o ' \(#[0-9]+\)$' || true) | grep -o '[0-9]*' | LC_ALL=C sort | LC_ALL=C join - ${DATADIR}/pr_status | wc -l)\
$(cat ${DATADIR}/commit_autolinks | grep 'close' | cut -f 2 | LC_ALL=C sort | LC_ALL=C join - ${DATADIR}/pr_status | wc -l) |\
jq -c --slurp --raw-input 'split("\t") | {"stat":"gh_merges_during_prs", "data":{"commits":(.[0]|tonumber), "gh_merges":(.[1]|tonumber), "gh_likely_merge":(.[2]|tonumber), "likely_external_merge":(.[3]|tonumber)}}'\
>> ${DATADIR}/stats.json
echo "use of autolinks in commits per commit"
cat ${DATADIR}/commit_autolinks | gawk -F\\t '{print $4, $1}' | sort | uniq -c | gawk '{print $2, $1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name autolink_commit_toany_per_commit -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "use of autolinks in commits to PR per commit"
cat ${DATADIR}/commit_autolinks | gawk -F\\t 'BEGIN {OFS="\t"} {print $2, $1, $4}' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
gawk -F\\t '{print $3, $2}' | sort | uniq -c | gawk '{print $2, $1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name autolink_commit_topr_per_commit -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "gathering use of autolinks in PRs..."
> ${DATADIR}/pull_autolinks
pv ${DATADIR}/pulls.gz | zcat | jq -r '.[] | [.number, .body] | @tsv' |\
gawk -F\\t -v dsname=pull -f ${DIR}/extractautolinks.awk\
>> ${DATADIR}/pull_autolinks
pv ${DATADIR}/issue-comments.gz | zcat | jq -r '.[] | [.issue_url, .body] | @tsv' |\
sed -e 's/^https:\/\/[^\t]*\/\([0-9]*\)\t/\1\t/' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status | cut -f 1,2 |\
gawk -F\\t -v dsname=pulltopcomment -f ${DIR}/extractautolinks.awk\
>> ${DATADIR}/pull_autolinks
pv ${DATADIR}/pull-comments.gz | zcat | jq -r '.[] | [.pull_request_url, .body] | @tsv' |\
sed -e 's/^https:\/\/[^\t]*\/\([0-9]*\)\t/\1\t/' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status | cut -f 1,2 |\
gawk -F\\t -v dsname=pullcodecomment -f ${DIR}/extractautolinks.awk\
>> ${DATADIR}/pull_autolinks
echo "use of autolinks in prs per pr"
cat ${DATADIR}/pull_autolinks | gawk '{printf("%s-%s\t%d\n", $3, $4, $1)}' | sort | uniq -c | gawk '{print $2, $1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name autolink_pr_toany_per_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "use of autolinks in prs to issues per pr"
cat ${DATADIR}/pull_autolinks | gawk 'BEGIN {OFS="\t"} {print $2, $1, $3, $4}' |\
LC_ALL=C sort | LC_ALL=C join -v1 -t$'\t' - ${DATADIR}/pr_status |\
gawk '{printf("%s-%s\t%d\n", $3, $4, $2)}' | sort | uniq -c | gawk '{print $2, $1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name autolink_pr_toissue_per_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "gathering use of autolinks in issues..."
> ${DATADIR}/issue_autolinks
pv ${DATADIR}/issues.gz | zcat | jq -r '.[] | [.number, .body] | @tsv' |\
LC_ALL=C sort | LC_ALL=C join -v1 -t$'\t' - ${DATADIR}/pr_status | cut -f 1,2 |\
gawk -F\\t -v dsname=issue -f ${DIR}/extractautolinks.awk\
>> ${DATADIR}/issue_autolinks
pv ${DATADIR}/issue-comments.gz | zcat | jq -r '.[] | [.issue_url, .body] | @tsv' |\
sed -e 's/^https:\/\/[^\t]*\/\([0-9]*\)\t/\1\t/' |\
LC_ALL=C sort | LC_ALL=C join -v1 -t$'\t' - ${DATADIR}/pr_status | cut -f 1,2 |\
gawk -F\\t -v dsname=issuecomment -f ${DIR}/extractautolinks.awk\
>> ${DATADIR}/issue_autolinks
echo "use of autolinks in issues per issue"
cat ${DATADIR}/issue_autolinks | gawk '{printf("%s-%s\t%d\n", $3, $4, $1)}' | sort | uniq -c | gawk '{print $2, $1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name autolink_issue_toany_per_issue -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "use of autolinks in issues to pulls per issue"
cat ${DATADIR}/issue_autolinks | gawk 'BEGIN {OFS="\t"} {print $2, $1, $3, $4}' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
gawk '{printf("%s-%s\t%d\n", $3, $4, $2)}' | sort | uniq -c | gawk '{print $2, $1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name autolink_issue_topr_per_issue -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
#PR cycle stats
echo "preparing PR comment data..."
cat \
<(zcat ${DATADIR}/issue-comments.gz | jq -r '.[] | [.issue_url, .user.login, .created_at, "toplevel"] | @tsv') \
<(zcat ${DATADIR}/pull-comments.gz | jq -r '.[] | [.pull_request_url, .user.login, .created_at, "codecomment"] | @tsv') |\
sed -e 's/^[^\t]*\/\([0-9]*\)\t/\1\t/' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status\
> ${DATADIR}/pr_comments_data
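#pr_comments_data columns: pr number, commenter login, comment created_at,
#comment kind (toplevel/codecomment), then the joined pr_status fields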
echo "devs per PR"
cat ${DATADIR}/pr_comments_data | cut -f 1,2,4 | sort -u |\
gawk -F\\t 'BEGIN {OFS="\t"} { print $0; print $1,$2,"any"}' |\
cut -f 1,3 | LC_ALL=C sort | uniq -c |\
gawk 'BEGIN {OFS="\t"} {print $2,$3,$1}' |\
LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status | gawk -F\\t '{printf("%s-%s\t%d\n", $4,$2,$3)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name dev_per_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "comments per dev-PR"
cat ${DATADIR}/pr_comments_data | cut -f 1,2,4 |\
gawk 'BEGIN {OFS="\t"} { print $1,$2,$3; print $1,$2,"any"}' |\
cut -f 1,3 | LC_ALL=C sort | uniq -c |\
gawk 'BEGIN {OFS="\t"} {print $2,$3,$1}' |\
LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status | gawk -F\\t '{printf("%s-%s\t%d\n", $4,$2,$3)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name comment_per_dev_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "comments per dev"
cat ${DATADIR}/pr_comments_data | cut -f 1,2,4 |\
gawk -F\\t 'BEGIN {OFS="\t"} { print $1,$2,$3; print $1,$2,"any"}' |\
LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
gawk -F\\t '$2!="" {printf("%s\t%s-%s\n", $2,$4,$3)}' | sort | uniq -c | gawk '{print $3,$1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name comment_per_dev -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "PR cycle count per PR by outcome"
cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
cut -f 1,2,3,6 | uniq | cut -f1,4 | uniq -c | gawk '{print $3,$1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_plies_per_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "computing avg comment time..."
AVG_COMMENT_TIME=$(\
cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
gawk -F\\t -i ${DIR}/reduce.awk 'BEGIN {OFS="\t";setkey("1\t2\t3");} function startrun(key) {state=$6;startts=$4;comments=0;sumtime=0;lastts=$4} function reduce(key) {if(comments>0) {print $4-lastts;} comments+=1; lastts=$4} function endrun(key) { }' |\
gawk '{s+=$1;n+=1} END {if(n>0) { print s/n } else { print 0} }')
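#AVG_COMMENT_TIME is the mean gap in seconds between consecutive comments within
#a ply; it's used below to estimate active review time as comments*avgctime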
echo "PR cycle time per cycle by outcome"
cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
gawk -F\\t -i ${DIR}/reduce.awk -v avgctime=$AVG_COMMENT_TIME 'BEGIN {OFS="\t";setkey("1\t2\t3");} function startrun(key) {state=$6;startts=$4;comments=0;lastts=$4} function reduce(key) { comments+=1;lastts=$4} function endrun(key) { print key[1], key[2], key[3], comments, comments*avgctime, lastts-startts, state}' |\
gawk -F\\t '{printf("%s-estimate\t%f\n", $7, $5);if($4>0) {printf("%s-floor\t%f\n", $7, $6)}}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_ply_time_per_ply -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "PR active review time by outcome"
cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
gawk -F\\t -i ${DIR}/reduce.awk -v avgctime=$AVG_COMMENT_TIME 'BEGIN {OFS="\t";setkey("1\t2\t3");} function startrun(key) {state=$6;startts=$4;comments=0;lastts=$4} function reduce(key) { comments+=1;lastts=$4} function endrun(key) { print key[1], key[2], key[3], comments, comments*avgctime, lastts-startts, state}' |\
gawk -F\\t -i ${DIR}/reduce.awk 'BEGIN {OFS="\t";setkey("1");} function startrun(key) {estimate=0;flr=0;state=$7} function reduce(key) {estimate+=$5;flr+=$6} function endrun(key) { printf("%s-estimate\t%f\n", state, estimate);printf("%s-floorwzero\t%f\n", state, flr);}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_time_per_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "PR active review time by outcome including zero engagement reviews"
cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
gawk -F\\t -i ${DIR}/reduce.awk -v avgctime=$AVG_COMMENT_TIME 'BEGIN {OFS="\t";setkey("1\t2\t3");} function startrun(key) {state=$6;startts=$4;comments=0;lastts=$4} function reduce(key) { comments+=1;lastts=$4} function endrun(key) { print key[1], key[2], key[3], comments, comments*avgctime, lastts-startts, state}' |\
LC_ALL=C join -t$'\t' -o 0,2.2,2.3,2.4,2.5,2.6,1.2 -a1 ${DATADIR}/pr_status - |\
gawk -F\\t -i ${DIR}/reduce.awk 'BEGIN {OFS="\t";setkey("1");} function startrun(key) {estimate=0;flr=0;state=$7} function reduce(key) {estimate+=$5;flr+=$6} function endrun(key) { printf("%s-estimate\t%f\n", state, estimate);printf("%s-floorwzero\t%f\n", state, flr);}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_time_per_pr_wzero -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "PR active review time per reviewer by outcome"
cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
gawk -F\\t -i ${DIR}/reduce.awk -v avgctime=$AVG_COMMENT_TIME 'BEGIN {OFS="\t";setkey("1\t2\t3");} function startrun(key) {state=$6;startts=$4;comments=0;lastts=$4} function reduce(key) { comments+=1;lastts=$4} function endrun(key) { print key[1], key[2], key[3], comments, comments*avgctime, lastts-startts, state}' |\
gawk -F\\t -i ${DIR}/reduce.awk 'BEGIN {OFS="\t";setkey("1\t2");} function startrun(key) {estimate=0;flr=0;state=$7} function reduce(key) {estimate+=$5;flr+=$6} function endrun(key) { printf("%s-estimate\t%f\n", state, estimate);printf("%s-floorwzero\t%f\n", state, flr);}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_time_per_dev_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
#TODO: not clear this is a useful stat
#echo "PR inter cycle time per inter cycle by outcome"
#cat ${DATADIR}/pr_comments_data | gawk -F\\t -i ${DIR}/date.awk -i ${DIR}/reduce.awk -f ${DIR}/extractplies.awk |\
# gawk -F\\t -i reduce.awk 'BEGIN {OFS="\t";setkey("1\t2\t3");} function startrun(key) {state=$6;startts=$4;comments=0;lastts=$4;createts=$6;closets=$7} function reduce(key) { comments+=1;lastts=$4} function endrun(key) { print key[1], startts, lastts, state, createts, closets}' |\
# sort |\
# gawk -F\\t -i reduce.awk 'BEGIN {OFS="\t";setkey("1");} function startrun(key) {lastts=$3; state=$4; closets=$7; if($2-$6>2*3600) { print key[1], $2-$6, state;}} function reduce(key) {if(lastts<$2) { print key[1], $2-lastts, state} if(lastts<$3) {lastts=$3} } function endrun(key) { if(closets!="" && closets-lastts > 2*3600) { print key[1],closets-lastts,state; } }'
#code lifetime stats
echo "code birthdate summary"
pv ${DATADIR}/metadata.gz | zcat | cut -f 4,7 |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name code_birthdate_summary -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "code lifetime summary"
pv ${DATADIR}/metadata.gz | zcat | cut -f4,5 |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name code_lifetime_summary -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "dead code lifetime distribution"
pv ${DATADIR}/metadata.gz | zcat | (ag 'died' || true) | cut -f 5 | sort -n |\
gawk -f ${DIR}/cdf.awk <(echo -n "86400_604800_1209600_2592000_5184000_7776000_15552000_31104000" | tr '_' '\n') - |\
jq -c --slurp --raw-input --arg stat_name code_lifetime_died_cdf -f ${DIR}/cdf2json.jq\
>> ${DATADIR}/stats.json
echo "live code lifetime distribution"
pv ${DATADIR}/metadata.gz | zcat | (ag 'live' || true) | cut -f 5 | sort -n |\
gawk -f ${DIR}/cdf.awk <(echo -n "86400_604800_1209600_2592000_5184000_7776000_15552000_31104000" | tr '_' '\n') - |\
jq -c --slurp --raw-input --arg stat_name code_lifetime_live_cdf -f ${DIR}/cdf2json.jq\
>> ${DATADIR}/stats.json
#TODO: code lifetime stats per file
#echo "code lifetime from merged PR summary"
#cat ${DATADIR}/pr_status | (grep 'merged' || true) |\
# cut -f 5 | LC_ALL=C sort |\
# LC_ALL=C join -t$'\t' - <(zcat ${DATADIR}/metadata.gz | gawk -F\\t 'BEGIN {OFS="\t"} {print $2, $4, $5}' | LC_ALL=C sort) |\
# gawk -F\\t 'BEGIN {OFS="\t"} {printf("%s\t%s\n", $2, $3)}' |\
# gawk -M -f ${DIR}/groupstats.awk |\
# jq -c --slurp --raw-input --arg stat_name code_lifetime_merged_pr_summary -f ${DIR}/gs2json.jq\
# >> ${DATADIR}/stats.json
#TODO: need to add in code that is merged but not reviewed (as zeros? as different case?)
#TODO: need to sample these cases
#code lifetime vs comments on PR
#comments by pr from commentcounts
#state and merge commit by pr from pr_status (just merges)
#code lifetime by commit from metadata
#echo "code lifetime vs comments on PR"
#cat ${DATADIR}/pr_status | (grep 'merged' || true) | join -t$'\t' ${DATADIR}/commentcounts -|\
# gawk -F\\t 'BEGIN {OFS="\t"} {print $7,$4,$2,$3}' | LC_ALL=C sort |\
# LC_ALL=C join -t$'\t' - <(zcat ${DATADIR}/metadata.gz | gawk -F\\t 'BEGIN {OFS="\t"} {print $2, $4, $5}' | LC_ALL=C sort) |\
# gawk -F\\t 'BEGIN {OFS="\t"} {printf("%s-%s-%s\t%d\t%d\n", $2, $5, $3, $4, $6)}' |\
# jq -c --slurp --raw-input --arg stat_name code_lifetime_vs_comments -f ${DIR}/groupscatter2json.jq\
# >> ${DATADIR}/stats.json
echo "preparing a sample of reviewed and unreviewed commits..."
> ${DATADIR}/reviewed_commits.tmp
#commits with known GH templates
cat ${DATADIR}/commit_messages |\
ag -A1 '^__commit__ [0-9a-f]{40}$' |\
gawk 'BEGIN {OFS="\t"} /^__commit__ [a-f0-9]{40}$/ {commit=$2} !/^__commit__ [a-f0-9]{40}$/ {print commit, $0}' |\
gawk -F\\t 'BEGIN {OFS="\t"} {if(match($2, /(Merge pull request #([0-9]+))|(\(#([0-9]+)\)$)/, m) > 0) { if(m[2] == "") { print m[4], $1} else { print m[2], $1}}}' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
gawk -F\\t 'BEGIN {OFS="\t"} {print $2, $1}' | LC_ALL=C sort -u\
>> ${DATADIR}/reviewed_commits.tmp
#commits from external merge tools
cat ${DATADIR}/commit_autolinks |\
(ag 'closes' || true) |\
gawk -F\\t 'BEGIN {OFS="\t"} {print $2,$1}' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
gawk -F\\t 'BEGIN {OFS="\t"} {print $2,$1}' | LC_ALL=C sort -u\
>> ${DATADIR}/reviewed_commits.tmp
#commits listed in merge_commit_sha for merged PRs
cat ${DATADIR}/pr_status |\
(ag 'merged' || true) |\
gawk -F\\t 'BEGIN {OFS="\t"} {print $5,$1}' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - ${DATADIR}/commitdates |\
cut -f 1,2\
>> ${DATADIR}/reviewed_commits.tmp
#cascade review to commits with same committer and commit time
zcat ${DATADIR}/commit_graph.gz |\
gawk -F\\t '{printf("%s\t%d\n", $0, NR)}' |\
LC_ALL=C sort |\
LC_ALL=C join -t$'\t' -a2 -o0,1.2,2.3,2.4,2.5,2.7 <(cat ${DATADIR}/reviewed_commits.tmp | LC_ALL=C sort) - |\
sort -t$'\t' -k4r,4r -k3,3 -k6n,6n |\
gawk -F\\t '$2 == "" && $3 == last_ce && $4 == last_ct && last_pr != "" {printf("%s\t%s\n", $1, last_pr)} $2 == "" && $3 != last_cs || $4 != last_ct { last_pr = "" } $2 != "" {printf("%s\t%s\n", $1, $2)} $2!="" && ($3 != last_cs || $4 != last_ct) { last_pr=$2;last_ce=$3;last_ct=$4}' |\
LC_ALL=C sort -u\
> ${DATADIR}/reviewed_commits
rm ${DATADIR}/reviewed_commits.tmp
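#reviewed_commits rows are commit sha<TAB>associated PR number;
#unreviewed_commits (built next) lists in-window commits with no PR association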
#the complement, but during the right period
zcat ${DATADIR}/commit_graph.gz |\
gawk -F\\t '$4 >='$(date -d ${EARLIEST_DATE} +%s) | cut -f1 |\
LC_ALL=C sort |\
LC_ALL=C join -v1 - ${DATADIR}/reviewed_commits \
> ${DATADIR}/unreviewed_commits
echo "commit distribution across authors (all commits on default branch)"
cat ${DATADIR}/commits_with_author |\
gawk -F\\t '{print $2}' | sort | uniq -c | sort -rn |\
gawk '{d[NR]=$1;s+=$1;} END {c=0; for (x in d) { c+=d[x]/s; print c}}' |\
gawk -f ${DIR}/cdf.awk <(echo "0.1_0.25_0.5_0.75_0.8_0.9_0.95_0.99_1" | tr '_' '\n') - |\
jq --slurp --raw-input --arg stat_name commits_proportion_by_dev_cdf -f ${DIR}/cdf2json.jq \
>> ${DATADIR}/stats.json
echo "commit distribution across authors (during analysis period)"
cat ${DATADIR}/commits_with_author |\
LC_ALL=C join -t$'\t' - <(cat ${DATADIR}/reviewed_commits ${DATADIR}/unreviewed_commits | LC_ALL=C sort) |\
gawk -F\\t '{print $2}' | sort | uniq -c | sort -rn |\
gawk '{d[NR]=$1;s+=$1;} END {c=0; for (x in d) { c+=d[x]/s; print c}}' |\
gawk -f ${DIR}/cdf.awk <(echo "0.1_0.25_0.5_0.75_0.8_0.9_0.95_0.99_1" | tr '_' '\n') - |\
jq --slurp --raw-input --arg stat_name during_pr_commits_proportion_by_dev_cdf -f ${DIR}/cdf2json.jq \
>> ${DATADIR}/stats.json
echo "commit distribution across authors (reviewed)"
cat ${DATADIR}/commits_with_author |\
LC_ALL=C join -t$'\t' - ${DATADIR}/reviewed_commits |\
gawk -F\\t '{print $2}' | sort | uniq -c | sort -rn |\
gawk '{d[NR]=$1;s+=$1;} END {c=0; for (x in d) { c+=d[x]/s; print c}}' |\
gawk -f ${DIR}/cdf.awk <(echo "0.1_0.25_0.5_0.75_0.8_0.9_0.95_0.99_1" | tr '_' '\n') - |\
jq --slurp --raw-input --arg stat_name rev_commits_proportion_by_dev_cdf -f ${DIR}/cdf2json.jq \
>> ${DATADIR}/stats.json
echo "commit distribution across authors (unreviewed)"
cat ${DATADIR}/commits_with_author |\
LC_ALL=C join -t$'\t' - ${DATADIR}/unreviewed_commits |\
gawk -F\\t '{print $2}' | sort | uniq -c | sort -rn |\
gawk '{d[NR]=$1;s+=$1;} END {c=0; for (x in d) { c+=d[x]/s; print c}}' |\
gawk -f ${DIR}/cdf.awk <(echo "0.1_0.25_0.5_0.75_0.8_0.9_0.95_0.99_1" | tr '_' '\n') - |\
jq --slurp --raw-input --arg stat_name unrev_commits_proportion_by_dev_cdf -f ${DIR}/cdf2json.jq \
>> ${DATADIR}/stats.json
echo "comparing GH commit pull association with commit message analysis"
cat ${DATADIR}/commit_pulls |\
sed -E 's/^https:\/\/api.github.com\/repos\/[^\/]*\/[^\/]*\/commits\/([a-f0-9]{40})\/pulls/\1/' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - <(cat ${DATADIR}/reviewed_commits ${DATADIR}/unreviewed_commits | LC_ALL=C sort) |\
jq -r -R 'split("\t") | [.[0], (.[1] | fromjson | length), .[2]] | @tsv' |\
gawk -F\\t '$3=="" {printf("unreviewed\t%d\n",$2>0)} $3!="" {printf("reviewed\t%d\n",$2>0)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name gh_rev_vs_commit_rev -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
cat ${DATADIR}/commit_pulls |\
sed -E 's/^https:\/\/api.github.com\/repos\/[^\/]*\/[^\/]*\/commits\/([a-f0-9]{40})\/pulls/\1/' |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' - <(cat ${DATADIR}/reviewed_commits ${DATADIR}/unreviewed_commits | LC_ALL=C sort) |\
jq -r -R 'split("\t") | [.[0], (.[2] as $prnumber | .[1] | fromjson | map(.number) | map(select(. == ($prnumber // "0" | tonumber))) | length), .[2]] | @tsv' |\
gawk -F\\t '$3=="" {printf("unreviewed\t%d\n",$2>0)} $3!="" {printf("reviewed\t%d\n",$2>0)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name gh_rev_vs_commit_rev_strict -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "dates for reviewed vs unreviewed commits"
cat\
<(cat ${DATADIR}/reviewed_commits | cut -f 1 |\
LC_ALL=C join -t$'\t' - ${DATADIR}/commitdates |\
gawk -F\\t '{printf("reviewed\t%f\n", $2)}')\
<(cat ${DATADIR}/unreviewed_commits |\
LC_ALL=C join -t$'\t' - ${DATADIR}/commitdates |\
gawk -F\\t '{printf("unreviewed\t%f\n", $2)}') |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name commit_review_vs_date -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "overlap in files for reviewed vs unreviewed commits"
cat\
<(cat ${DATADIR}/unreviewed_commits | gawk -v commits=$(cat ${DATADIR}/unreviewed_commits | wc -l) '{printf("%s\t%s\t%f\n", $1, "unreviewed", 1.0/commits)}')\
<(cat ${DATADIR}/reviewed_commits | gawk -v commits=$(cat ${DATADIR}/reviewed_commits | wc -l) '{printf("%s\t%s\t%f\n", $1, "reviewed", 1.0/commits)}') |\
LC_ALL=C sort | LC_ALL=C join -t$'\t' -\
<(cat ${DATADIR}/filestatus | cut -f 1,4 | LC_ALL=C sort) |\
gawk 'BEGIN {OFS="\t"} {print $4,$2,$3}' |\
LC_ALL=C sort |\
gawk -F\\t -i ${DIR}/reduce.awk -f ${DIR}/jaccard.awk |\
jq -c --slurp --raw-input --arg stat_name commit_review_file_overlap -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
#echo "lifetime for code from reviewed vs unreviewed commits"
#cat\
# <(cat ${DATADIR}/reviewed_commits | cut -f 1 |\
# gawk -F\\t '{printf("%s\treviewed\n", $1)}')\
# <(cat ${DATADIR}/unreviewed_commits |\
# gawk -F\\t '{printf("%s\tunreviewed\n", $1)}') |\
# LC_ALL=C sort |\
# LC_ALL=C join -t$'\t' - <(zcat ${DATADIR}/metadata.gz | cut -f 2,5 | LC_ALL=C sort) | cut -f 2,3 |\
# gawk -M -f ${DIR}/groupstats.awk |\
# jq -c --slurp --raw-input --arg stat_name commit_review_vs_lifetime -f ${DIR}/gs2json.jq\
# >> ${DATADIR}/stats.json
#cat\
# <(cat ${DATADIR}/reviewed_commits | cut -f 1 |\
# gawk -F\\t '{printf("%s\treviewed\n", $1)}')\
# <(cat ${DATADIR}/unreviewed_commits |\
# gawk -F\\t '{printf("%s\tunreviewed\n", $1)}') |\
# LC_ALL=C sort |\
# LC_ALL=C join -t$'\t' - <(zcat ${DATADIR}/metadata.gz | ag 'died' | cut -f 2,5 | LC_ALL=C sort) | cut -f 2,3 |\
# gawk -M -f ${DIR}/groupstats.awk |\
# jq -c --slurp --raw-input --arg stat_name commit_review_vs_lifetime_died -f ${DIR}/gs2json.jq\
# >> ${DATADIR}/stats.json
#cat\
# <(cat ${DATADIR}/reviewed_commits | cut -f 1 |\
# gawk -F\\t '{printf("%s\treviewed\n", $1)}')\
# <(cat ${DATADIR}/unreviewed_commits |\
# gawk -F\\t '{printf("%s\tunreviewed\n", $1)}') |\
# LC_ALL=C sort |\
# LC_ALL=C join -t$'\t' - <(zcat ${DATADIR}/metadata.gz | ag 'live' | cut -f 2,5 | LC_ALL=C sort) | cut -f 2,3 |\
# gawk -M -f ${DIR}/groupstats.awk |\
# jq -c --slurp --raw-input --arg stat_name commit_review_vs_lifetime_live -f ${DIR}/gs2json.jq\
# >> ${DATADIR}/stats.json
#echo "lines of code for reviewed vs unreviewed commits"
#cat\
# <(cat ${DATADIR}/reviewed_commits | cut -f1 | gawk '{printf("%s\treviewed\n", $1)}')\
# <(cat ${DATADIR}/unreviewed_commits | cut -f1 | gawk '{printf("%s\tunreviewed\n", $1)}') |\
# LC_ALL=C sort |\
# LC_ALL=C join <(zcat ${DATADIR}/metadata.gz | cut -f 2 | LC_ALL=C sort) - |\
# LC_ALL=C sort | uniq -c |\
# gawk '{printf("%s\t%d\n",$3,$1)}' |\
# gawk -M -f ${DIR}/groupstats.awk |\
# jq -c --slurp --raw-input --arg stat_name commit_review_size -f ${DIR}/gs2json.jq\
# >> ${DATADIR}/stats.json
#echo "lines of code for reviewed vs unreviewed commits by outcome"
#cat\
# <(cat ${DATADIR}/reviewed_commits | cut -f1 | gawk '{printf("%s\treviewed\n", $1)}')\
# <(cat ${DATADIR}/unreviewed_commits | cut -f1 | gawk '{printf("%s\tunreviewed\n", $1)}') |\
# LC_ALL=C sort |\
# LC_ALL=C join <(zcat ${DATADIR}/metadata.gz | cut -f 2,4 | LC_ALL=C sort) - |\
# LC_ALL=C sort | uniq -c |\
# gawk '{printf("%s-%s\t%d\n",$4,$3,$1)}' |\
# gawk -M -f ${DIR}/groupstats.awk |\
# jq -c --slurp --raw-input --arg stat_name commit_review_size_by_outcome -f ${DIR}/gs2json.jq\
# >> ${DATADIR}/stats.json
#TODO: commit structuredness within a PR stats
#length of PR commit messages by outcome
#size of PR commits by outcome
#number of PR commits per pr by outcome
#TBD: overlap in lines for commits in a pr
#gather PR sample comment hashes
echo "preparing comment commits..."
pv ${DATADIR}/pull-comments.gz | zcat | jq -r '.[] | [.pull_request_url, .commit_id, .original_commit_id] | @tsv' | sed 's/^https[^\t]*\/\([0-9]*\)\t/\1\t/' | LC_ALL=C sort | LC_ALL=C join -t$'\t' <(LC_ALL=C join -t$'\t' ${DATADIR}/pr_sample ${DATADIR}/pr_status) - > ${DATADIR}/pr_sample_comment_commits
#existence of PR comment commits in default branch
echo "PR comment commit exists in default branch"
cat ${DATADIR}/pr_sample_comment_commits |\
gawk -F\\t 'BEGIN {OFS="\t"} FNR==NR {refs[$1] = 1} FNR!=NR { all[$2] += 1; commit[$2] += refs[$5]; original[$2] += refs[$6] } END {for(x in all) { print x, all[x], commit[x], original[x] } }' ${DATADIR}/commits_with_author - |\
jq -c --slurp --raw-input 'split("\n") | map(split("\t")) | .[0:-1] | map({"state":.[0], "comments":.[1], "commit_exists":.[2], "original_exists":.[3]}) | {"stat":"pr_comment_commit_exists_default_branch", "data":.}'\
>> ${DATADIR}/stats.json
#existence of PR comment commits in PR commit list
cat ${DATADIR}/pr_sample_commits |\
jq --raw-input -r 'split("\t") | (.[0] | match(".*pulls/([0-9]+)/commits.*") | .captures[0].string | tonumber) as $pr_number | .[1] | fromjson | .[] | [$pr_number, .sha, .commit.committer.date] | @tsv' |\
gawk -i ${DIR}/date.awk -F\\t 'BEGIN {OFS="\t"} {print $1, $2, parsedate($3)}'\
> ${DATADIR}/pr_sample_commits_parsed
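#pr_sample_commits_parsed columns: pr number, commit sha, committer date as epoch seconds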
echo "PR comment commit exists in PR list"
cat ${DATADIR}/pr_sample_comment_commits |\
gawk -F\\t 'BEGIN {OFS="\t"} FNR==NR {refs[$1][$2] = 1} FNR!=NR { all[$2] += 1; commit[$2] += refs[$1][$5]; original[$2] += refs[$1][$6] } END {for(x in all) { print x, all[x], commit[x], original[x] } }' ${DATADIR}/pr_sample_commits_parsed - |\
jq -c --slurp --raw-input 'split("\n") | map(split("\t")) | .[0:-1] | map({"state":.[0], "comments":.[1], "commit_exists":.[2], "original_exists":.[3]}) | {"stat":"pr_comment_commit_exists_commitlist", "data":.}'\
>> ${DATADIR}/stats.json
echo "PR commit exists in default branch"
cat ${DATADIR}/pr_sample_commits_parsed |\
LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
cut -f 2,4 | LC_ALL=C sort |\
LC_ALL=C join -a1 -t$'\t' - ${DATADIR}/commits_with_author |\
gawk -F\\t '$3=="" { printf("%s\t0\n", $2)} $3!="" { printf("%s\t1\n", $2)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name pr_commit_exists_default_branch -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "PR commits per PR"
cat ${DATADIR}/pr_sample_commits_parsed |\
LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
cut -f 1,4 | sort | uniq -c |\
gawk '{print $3,$1}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name commits_per_pr -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo "PR commit relative date to PR"
cat ${DATADIR}/pr_sample_commits_parsed |\
LC_ALL=C join -t$'\t' - ${DATADIR}/pr_status |\
gawk -i ${DIR}/date.awk -F\\t '{print $4, $3-parsedate($5)}' |\
gawk -M -f ${DIR}/groupstats.awk |\
jq -c --slurp --raw-input --arg stat_name commit_relative_age -f ${DIR}/gs2json.jq\
>> ${DATADIR}/stats.json
echo
echo "computing timespan limited stats..."
echo
for span in $SPAN_DAYS; do
SPAN_DATE=$(date -d@$(( $(date -d "$LATEST_DATE" +%s) - $(( $span * 86400 )) )) --utc +%Y-%m-%dT%H:%M:%SZ)
if [[ "$SPAN_DATE" < "$EARLIEST_PR" ]]; then
SPAN_DATE=$EARLIEST_PR
fi
SPAN_DAYS=$span EARLIEST_PR=$SPAN_DATE DATADIR=$DATADIR DEFAULT_BRANCH=${DEFAULT_BRANCH} ${DIR}/analyze |\
jq -c --slurp '{"stat": "analysis_days_'${span}'", "data": . }'\
>> ${DATADIR}/stats.json
done
compute_stats_time=$(( $(date +%s) - ${starttime}))
printf "stats_time\t%f\n" $compute_stats_time >> ${DATADIR}/statstimes.tsv
cat ${DATADIR}/statstimes.tsv |\
jq -c -R --slurp 'split("\n")[0:-1] | map(split("\t") | {(.[0]): (.[1] | tonumber)}) | add | {"stat": "stats_times", "data": . }' \
>> ${DATADIR}/stats.json
echo "done computing stats in in ${compute_stats_time}s."
echo
echo "Thank You! Please email back the file ${DATADIR}/stats.json"