Skip to content

Commit 4421b06

Browse files
committed
updating the scripts
1 parent c6eb0bb commit 4421b06

File tree

13 files changed

+576
-66
lines changed

13 files changed

+576
-66
lines changed

.github/workflows/deploy.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ jobs:
3434
pip install scholarly
3535
pip install bibtexparser
3636
37+
# - name: Move old dynamic to static
38+
# run: |
39+
# python scripts/migrate_files.py
40+
3741
# - name: Update publications file
3842
# run: |
3943
# python scripts/update_pubs.py

_data/pub/dynamic.bib

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
% AUTO-GENERATED FILE. DO NOT EDIT.
2-
% Updated on 2026-02-02T22:42:11.194679Z
2+
% Updated on: 2026-02-03T16:59:56.521905Z
33
44
@article{10.2139/ssrn.5943954,
55
author = {Atshan, Samer and Namgung, Min and Lee, Janghyeon and Dhankhar, Anushikha and Khobragade, Pranali and Cole, Aidan and Ailshire, Jennifer A. and Adar, Sara D. and Chiang, Yao-Yi and Lee, Jinkook and Nichols, Emma},
66
doi = {10.2139/ssrn.5943954},
77
issn = {1556-5068},
88
journal = {SSRN Electronic Journal},
99
publisher = {Elsevier BV},
10-
title = {<p>Validating Machine Learning–Derived Built Environment Measures From Google Street View for Urban Aging Research in India</p>},
10+
title = {Validating Machine Learning–Derived Built Environment Measures From Google Street View for Urban Aging Research in India},
1111
url = {https://doi.org/10.2139/ssrn.5943954},
1212
year = {2025}
1313
}
@@ -307,7 +307,7 @@ @inproceedings{10.1007/978-3-032-09530-5_26
307307
isbn = {978-3-032-09530-5},
308308
pages = {451--471},
309309
publisher = {Springer Nature Switzerland},
310-
title = {Exploiting LLMs and Semantic Technologies to Build a Knowledge Graph of Historical Mining Data},
310+
title = {Exploiting LLMs and Semantic Technologies to Build a Knowledge Graph of Historical Mining Data},
311311
url = {https://link.springer.com/chapter/10.1007/978-3-032-09530-5_26},
312312
year = {2026}
313313
}
@@ -325,7 +325,7 @@ @inproceedings{10.1007/978-3-032-04617-8_3
325325
isbn = {978-3-032-04617-8},
326326
pages = {40--59},
327327
publisher = {Springer Nature Switzerland},
328-
title = {LDTR: Linear Object Detection Transformer for Accurate Graph Generation by Learning the N-Hop Connectivity Information},
328+
title = {LDTR: Linear Object Detection Transformer for Accurate Graph Generation by Learning the N-Hop Connectivity Information},
329329
url = {https://link.springer.com/chapter/10.1007/978-3-032-04617-8_3},
330330
year = {2026}
331331
}
@@ -345,7 +345,7 @@ @inproceedings{10.1007/978-3-032-04617-8_4
345345
isbn = {978-3-032-04617-8},
346346
pages = {60--77},
347347
publisher = {Springer Nature Switzerland},
348-
title = {LIGHT: Multi-modal Text Linking on Historical Maps},
348+
title = {LIGHT: Multi-modal Text Linking on Historical Maps},
349349
url = {https://link.springer.com/chapter/10.1007/978-3-032-04617-8_4},
350350
year = {2026}
351351
}

scripts/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .providers import *

scripts/_archived_full.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ def token_set_ratio(a: str, b: str) -> float:
111111
def strip_html_tags(s: str) -> str:
112112
if not s:
113113
return ""
114-
# Crossref abstracts are sometimes JATS-ish; strip tags crudely.
115114
s = re.sub(r"<[^>]+>", " ", s)
116115
s = html.unescape(s)
117116
return normalize_ws(s)
@@ -320,15 +319,15 @@ def springer_bibtex_by_doi(doi: str) -> str:
320319
def get_bibtex_with_fallback(p_full: dict, title: str) -> str:
321320
# 1) Try directly
322321
try:
323-
s = scholarly.bibtex(p_full)
322+
s = scholarly.bibtex(p_full) # Get bibtext directly
324323
if s:
325324
return s
326325
except Exception:
327326
pass
328327

329328
# 2) Fallback: search by title
330329
try:
331-
q = scholarly.search_pubs(title)
330+
q = scholarly.search_pubs(title) # Search by title
332331
pub2 = next(q, None)
333332
if not pub2:
334333
return ""
@@ -854,7 +853,6 @@ def main():
854853
best_doi = (best.get("DOI") or "").strip()
855854
if best_doi:
856855
crossref_bib = crossref_bibtex_transform(best_doi)
857-
# also fetch message if you want abstract etc.
858856
try:
859857
crossref_msg = crossref_lookup_by_doi(best_doi)
860858
except Exception:

scripts/helpers.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from typing import List
2+
3+
import re
4+
5+
import bibtexparser
6+
7+
# Regex patterns
DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
ARXIV_WORD_RE = re.compile(r"\barxiv\b", re.I)


def extract_doi_any(list_text: List[str]) -> str:
    """Return the first DOI found in any of the given strings.

    Falsy entries (None, "") are skipped. Returns "" when no string
    contains a DOI.
    """
    for candidate in filter(None, list_text):
        found = DOI_RE.search(candidate)
        if found:
            return found.group(1)
    return ""
25+
26+
def citation_type(venue: str = None,
                  citation: str = None,
                  scholar_bibtex: str = None,
                  link: str = None,
                  doi: str = None) -> str:
    """Classify a publication to choose the provider used for bibtex extraction.

    Returns one of 'arxiv', 'acm', 'springer', or 'fallback'.

    All arguments are optional metadata fragments; any of them may be
    None or empty, in which case they simply do not contribute to the
    classification.
    """
    # BUG FIX: only `venue` was guarded with `or ""`; passing None for
    # citation/scholar_bibtex/link made re.search raise TypeError, and a
    # None doi crashed on .startswith. Guard every field.
    if any(ARXIV_WORD_RE.search(text or "")
           for text in (venue, citation, scholar_bibtex, link)):
        return 'arxiv'

    doi = doi or ""
    if doi.startswith("10.1145/"):   # ACM's DOI registrant prefix
        return 'acm'
    if doi.startswith("10.1007/"):   # Springer's DOI registrant prefix
        return 'springer'

    return 'fallback'
47+
48+
def bibtex_to_fields(bibtex_str: str) -> dict:
    """Parse a bibtex string into a flat field dict.

    Only the first entry is used. Keys are lower-cased and stripped,
    values stringified and stripped. Returns {} on empty input, on a
    parse failure, or when the parsed database has no entries.
    """
    if not bibtex_str:
        return {}

    try:
        database = bibtexparser.loads(bibtex_str)
    except Exception:
        # Malformed bibtex: treat as "no fields" rather than propagate.
        return {}

    entries = getattr(database, "entries", None)
    if not entries:
        return {}

    first = entries[0]
    return {
        str(key).lower().strip(): str(value).strip()
        for key, value in first.items()
        if value is not None
    }

scripts/providers/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from .arxiv import arxiv_entry, extract_arxiv_any
2+
from .acm import acm_bibtex_by_doi
3+
from .springer import springer_bibtex_by_doi
4+
from .crossref import crossref_entry
5+
from .scholarlyp import get_bibtex_with_fallback
6+
from .utils import build_entry_bibtex

scripts/providers/acm.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import requests
2+
3+
# Browser-like User-Agent — presumably set so dl.acm.org serves the
# citation download to a non-browser client; confirm if requests start failing.
UA_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120 Safari/537.36"
    )
}


def acm_bibtex_by_doi(doi: str) -> str:
    """Download a bibtex record from the ACM Digital Library for an ACM DOI.

    Returns "" for non-ACM DOIs, on any network error, or when the
    response does not look like bibtex (best-effort, never raises).
    """
    if not doi.startswith("10.1145/"):  # only ACM-registered DOIs
        return ""

    try:
        response = requests.get(
            "https://dl.acm.org/action/downloadCitation",
            params={"doi": doi, "format": "bibtex"},
            headers=UA_HEADERS,
            timeout=25,
        )
        if response.status_code == 200 and "@" in response.text:
            return response.text.strip()
    except Exception:
        # Deliberate best-effort: callers fall back to other providers.
        pass

    return ""

scripts/providers/arxiv.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
from typing import List
2+
3+
import re
4+
5+
import html
6+
import requests
7+
8+
from .utils import *
9+
10+
# Matches arxiv.org /abs/ and /pdf/ URLs and captures the identifier:
# new-style (2101.12345, optional vN) or old-style (cs/0112017).
ARXIV_ID_RE = re.compile(
    r"(?:arxiv\.org/(?:abs|pdf)/)(?P<id>(?:\d{4}\.\d{4,5}|[a-z\-]+/\d{7})(?:v\d+)?)",
    re.I,
)

UA_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120 Safari/537.36"
    )
}


def extract_arxiv_any(list_text: List[str]) -> str:
    """Return the first arXiv identifier found in any of the given strings.

    Tries arxiv.org URLs first, then bare "arXiv: NNNN.NNNNN" citations.
    Falsy entries are skipped; returns "" when nothing matches.
    """
    for blob in list_text:
        if not blob:
            continue

        url_hit = ARXIV_ID_RE.search(blob)
        if url_hit:
            return url_hit.group("id")

        cite_hit = re.search(r"\barxiv:\s*([0-9]{4}\.[0-9]{4,5}(?:v\d+)?)\b", blob, re.I)
        if cite_hit:
            return cite_hit.group(1)

    return ""
39+
40+
def xml_text(tag: str,
             xml: str) -> str:
    """Return the unescaped, stripped text of the first <tag>...</tag> in xml.

    Case-insensitive, tolerates attributes on the opening tag, matches
    across newlines. Returns "" when the tag is absent.
    """
    found = re.search(rf"<{tag}[^>]*>(.*?)</{tag}>", xml, flags=re.S | re.I)
    if not found:
        return ""
    return html.unescape(found.group(1)).strip()
44+
45+
def arxiv_meta_by_id(arxiv_id: str) -> dict:
    """Fetch paper metadata from the arXiv Atom API by identifier.

    Returns a dict with keys title/authors/year/url/abstract/
    primary_category; individual values may be "" when absent from the
    feed. Raises requests exceptions on HTTP/network failure.
    """
    url = "http://export.arxiv.org/api/query"
    params = {"id_list": arxiv_id}
    r = requests.get(url, params=params, headers=UA_HEADERS, timeout=25)
    r.raise_for_status()
    xml = r.text

    # The feed carries several <title> elements; take the last one as the
    # entry's title (the first is the feed-level title).
    title = normalize_ws(xml_text("title", xml))
    titles = re.findall(r"<title[^>]*>(.*?)</title>", xml, flags=re.S | re.I)
    if titles:
        title = normalize_ws(html.unescape(titles[-1]))

    summary = normalize_ws(xml_text("summary", xml))
    # BUG FIX: the tag name was misspelled "publishedm", so the Atom
    # <published> date was never found and year was always "".
    published = xml_text("published", xml)
    year = published[:4] if published[:4].isdigit() else ""

    names = re.findall(r"<author>\s*<name>(.*?)</name>\s*</author>", xml, flags=re.S | re.I)
    authors = " and ".join(normalize_ws(html.unescape(n)) for n in names if normalize_ws(n))

    # Entry URL: the Atom <id> element holds the abs-page URL.
    entry_id = normalize_ws(xml_text("id", xml))

    mcat = re.search(r'<arxiv:primary_category[^>]+term="([^"]+)"', xml, flags=re.I)
    primary_cat = mcat.group(1).strip() if mcat else ""

    return {
        "title": title,
        "authors": authors,
        "year": year,
        "url": entry_id,
        "abstract": summary,
        "primary_category": primary_cat,
    }
82+
83+
def arxiv_meta_by_title(title: str,
                        first_author: str = "") -> dict:
    """Search the arXiv API by title and pick the best match.

    Queries ti:"<title>" (optionally AND au:<surname>), scores each
    returned entry's title against the requested one, and returns the
    best candidate as {"arxiv_id", "entry_url", "matched_title",
    "score"} — or {} when no entry could be scored.
    """
    qtitle = title.replace('"', "")
    search = f'ti:"{qtitle}"'

    if first_author:
        # Surname heuristic: last whitespace token before any comma.
        surname = first_author.split(",", 1)[0].split()[-1]
        search = f'{search} AND au:{surname}'

    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": search, "start": 0, "max_results": 5},
        headers=UA_HEADERS,
        timeout=25,
    )
    response.raise_for_status()
    xml = response.text

    wanted = normalize_title(title)
    best = None
    best_score = 0.0

    # Crude Atom handling: split the feed on </entry> and inspect each chunk.
    for fragment in re.split(r"</entry>\s*", xml, flags=re.I):
        if "<entry" not in fragment.lower():
            continue

        titles = re.findall(r"<title[^>]*>(.*?)</title>", fragment, flags=re.S | re.I)
        if not titles:
            continue

        matched_title = normalize_ws(html.unescape(titles[-1]))
        score = seq_ratio(wanted, normalize_title(matched_title))
        if score <= best_score:
            continue

        # New best: pull the entry URL and derive the arXiv id from it.
        id_match = re.search(r"<id[^>]*>(.*?)</id>", fragment, flags=re.S | re.I)
        entry_url = normalize_ws(html.unescape(id_match.group(1))) if id_match else ""
        best_score = score
        best = {
            "arxiv_id": extract_arxiv_any([entry_url]) or "",
            "entry_url": entry_url,
            "matched_title": matched_title,
            "score": score,
        }

    return best or {}
126+
127+
128+
129+
def _build_arxiv_meta(arxiv_id: str = None,
                      title: str = None,
                      authors_guess: str = None) -> dict:
    """Resolve arXiv metadata, falling back to a title search.

    When no arxiv_id is given, tries to discover one via
    arxiv_meta_by_title (authors_guess serves as the first-author hint).
    Returns {} when no id can be resolved or the metadata fetch fails.

    BUG FIXES: the return annotation said ``-> None`` although a dict is
    always returned; a missing title used to crash the fallback search;
    and an unresolved id used to hit the API with an empty id_list.
    """
    if not arxiv_id and title:
        best_guess = arxiv_meta_by_title(title,
                                         first_author=authors_guess or "")
        arxiv_id = best_guess.get("arxiv_id") or None

    if not arxiv_id:
        # Nothing to look up — avoid querying the API with an empty id.
        return {}

    try:
        return arxiv_meta_by_id(arxiv_id)
    except Exception:
        # Best-effort: callers treat {} as "no arXiv metadata available".
        return {}
142+
143+
def arxiv_entry(base_title: str,
                base_year: str,
                base_venue: str,
                base_link: str,
                authors_guess: str,
                scholar_bibtex: str,
                citation: str) -> dict:
    """Build a bibtex entry dict for an arXiv paper.

    Locates an arXiv id in the link / scholar bibtex / citation text,
    enriches the entry from the arXiv API, and falls back to the
    provided base_* values field by field. base_venue is currently
    unused but kept for a uniform provider signature.
    """
    arxiv_id = extract_arxiv_any([base_link, scholar_bibtex, citation])
    # BUG FIX: title/authors were not forwarded, so the title-search
    # fallback inside _build_arxiv_meta could never trigger when no
    # arXiv id was present in the input text.
    arxiv_meta = _build_arxiv_meta(arxiv_id,
                                   title=base_title,
                                   authors_guess=authors_guess)

    # Prefer API metadata; fall back to the caller-supplied values.
    title = arxiv_meta.get("title") or base_title
    authors = arxiv_meta.get("authors") or normalize_authors_to_bibtex(authors_guess)
    year = arxiv_meta.get("year") or base_year
    url = arxiv_meta.get("url") or base_link

    entry = {
        "ENTRYTYPE": "misc",
        "title": title,
        "author": normalize_authors_to_bibtex(authors),
        "year": year,
        "howpublished": "arXiv",
        "url": url,
    }

    if arxiv_meta.get("primary_category"):
        entry["primaryclass"] = arxiv_meta["primary_category"]

    # Abstract is optional but requested when available.
    if arxiv_meta.get("abstract"):
        entry["abstract"] = arxiv_meta["abstract"]

    # Deterministic bib key derived from author/year/title.
    entry["ID"] = make_bib_key(entry.get("author", ""), entry.get("year", ""), entry.get("title", ""))

    return entry

scripts/providers/crossref.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import html
import re


def clean_crossref_text(s: str) -> str:
    """Convert HTML entities (&lt; etc.) to characters, strip tags (<p>),
    and normalize whitespace.

    BUG FIX: the file was committed with no imports at all — ``html``,
    ``re``, and ``normalize_ws`` were undefined, so any call raised
    NameError. Stdlib imports are added and the whitespace collapse is
    inlined (mirrors utils.normalize_ws — confirm if that helper does
    anything beyond collapsing runs of whitespace).
    """
    if not s:
        return ""
    s = html.unescape(s)            # &lt;p&gt; -> <p>
    s = re.sub(r"<[^>]+>", " ", s)  # remove tags like <p>
    return " ".join(s.split())      # collapse all whitespace runs

0 commit comments

Comments
 (0)