-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjoss_extractor.py
More file actions
125 lines (98 loc) · 4.26 KB
/
joss_extractor.py
File metadata and controls
125 lines (98 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import requests
import csv
import time
from typing import List, Dict
import json
from datetime import datetime
import requests
import time
from typing import List, Dict
def fetch_joss_data() -> List[Dict]:
    """Fetch published JOSS paper records from the paginated JSON endpoint.

    Pages are requested one at a time, starting at page 1, until a page
    comes back empty. Fetching stops early, keeping whatever was collected
    so far, when a request fails, the response body is not valid JSON, or
    the decoded payload is not a JSON array.

    Returns:
        The decoded items from every successfully fetched page, in page
        order. The list may be empty or partial if an error interrupted
        the run.
    """
    base_url = "https://joss.theoj.org/papers/published.json"
    all_papers = []
    page = 1
    print("Starting extraction of JOSS papers...")
    while True:
        # The first page is requested without a query string.
        url = base_url if page == 1 else f"{base_url}?page={page}"
        print(f"Fetching page {page}...")
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError is caught as well because response.json() may raise
            # a plain ValueError (not a RequestException) on some versions
            # of requests when the body is not valid JSON.
            print(f"Error fetching page {page}: {e}")
            break  # Stop on error and return what has been collected.

        if not data:  # An empty page marks the end of the listing.
            print("No more papers found. Ending extraction.")
            break

        if not isinstance(data, list):
            # Extending with a dict would silently add its keys as "papers".
            print(
                f"Error fetching page {page}: "
                f"unexpected payload type {type(data).__name__}"
            )
            break

        all_papers.extend(data)
        print(f" → Retrieved {len(data)} papers (Total so far: {len(all_papers)})")
        time.sleep(0.1)  # Respectful delay between requests
        page += 1

    print(f"Finished. Total papers fetched: {len(all_papers)}")
    return all_papers
def create_csv(papers: List[Dict], filename: str = None):
    """Write the non-empty ``software_repository`` values to a one-column CSV.

    The file starts with an unquoted ``software_repository`` header line,
    followed by one double-quoted value per line. Values are stripped of
    surrounding whitespace; records whose value is missing, null, not a
    string, or blank are skipped.

    Args:
        papers: Paper records as dictionaries. Only the
            ``software_repository`` key is read.
        filename: Output path. When ``None``, a name of the form
            ``joss_repositories_<YYYYmmdd_HHMMSS>.csv`` is generated from
            the current local time.

    Returns:
        A ``(record_count, filename)`` tuple: the number of data rows
        written and the path that was used.
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"joss_repositories_{timestamp}.csv"
    print(f"Creating CSV file: {filename}")

    # Collect the usable repository values before writing so the count is
    # known up front. dict.get's default only applies when the key is absent,
    # so a present-but-null value must be handled explicitly.
    repositories_with_data = []
    for paper in papers:
        raw = paper.get('software_repository')
        repo = raw.strip() if isinstance(raw, str) else ''
        if repo:  # Only include non-empty repositories
            repositories_with_data.append(repo)

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        # The header is written by hand so it stays unquoted.
        csvfile.write('software_repository\n')
        # QUOTE_ALL wraps every value in double quotes and doubles any
        # embedded quote characters, keeping the file valid CSV.
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
        writer.writerows([repo] for repo in repositories_with_data)

    print("✅ CSV created successfully!")
    print(f"📁 Filename: {filename}")
    print(f"📊 Records in CSV: {len(repositories_with_data)}")
    return len(repositories_with_data), filename
def main():
    """Run the extraction end to end and print a report.

    Fetches the paper records, writes the repository CSV, then prints a
    summary, a verification section and a preview of up to five repository
    values. If nothing was fetched, only a failure message is printed. The
    total execution time is printed in both cases.
    """
    start_time = datetime.now()
    print("🚀 JOSS Papers Data Extractor")
    print("=" * 50)
    print(f"🕒 Started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Fetch data
    papers = fetch_joss_data()

    if papers:
        # Create CSV
        csv_record_count, csv_filename = create_csv(papers)
        total_papers = len(papers)
        missing_count = total_papers - csv_record_count

        # Print detailed summary with verification counts
        print("\n" + "="*60)
        print("📊 EXTRACTION SUMMARY")
        print("="*60)
        print(f"📥 Total papers processed: {total_papers}")
        print(f"📝 Records written to CSV: {csv_record_count}")
        print(f"❌ Papers without repositories: {missing_count}")
        print(f"📈 Repository coverage: {(csv_record_count/total_papers*100):.1f}%")
        print(f"📁 Output file: {csv_filename}")
        print(f"🕒 Extraction completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        # Verification check
        print(f"\n🔍 VERIFICATION:")
        print(f"✅ Processed {total_papers} papers from API")
        print(f"✅ Wrote {csv_record_count} repository URLs to CSV")
        print(f"✅ Data integrity: {csv_record_count + missing_count} = {total_papers} ✓")

        # Show first few entries as preview. dict.get's default is only used
        # when the key is absent, so a present-but-null value is guarded
        # against explicitly instead of calling .strip() on None.
        preview = []
        for paper in papers:
            raw = paper.get('software_repository')
            repo = raw.strip() if isinstance(raw, str) else ''
            if repo:
                preview.append(repo)
                if len(preview) == 5:
                    break
        print(f"\n📋 Preview (first 5 repositories):")
        for i, repo in enumerate(preview, 1):
            print(f'{i}. "{repo}"')
    else:
        print("❌ No data was extracted!")

    # Show total execution time
    end_time = datetime.now()
    duration = end_time - start_time
    print(f"\n⏱️ Total execution time: {duration.total_seconds():.1f} seconds")
# Run the extractor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()