Skip to content

Commit 4421b06

Browse files
committed
updating the scripts
1 parent c6eb0bb commit 4421b06

File tree

13 files changed

+576
-66
lines changed

13 files changed

+576
-66
lines changed

.github/workflows/deploy.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ jobs:
3434
pip install scholarly
3535
pip install bibtexparser
3636
37+
# - name: Move old dynamic to static
38+
# run: |
39+
# python scripts/migrate_files.py
40+
3741
# - name: Update publications file
3842
# run: |
3943
# python scripts/update_pubs.py

_data/pub/dynamic.bib

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
% AUTO-GENERATED FILE. DO NOT EDIT.
2-
% Updated on 2026-02-02T22:42:11.194679Z
2+
% Updated on: 2026-02-03T16:59:56.521905Z
33
44
@article{10.2139/ssrn.5943954,
55
author = {Atshan, Samer and Namgung, Min and Lee, Janghyeon and Dhankhar, Anushikha and Khobragade, Pranali and Cole, Aidan and Ailshire, Jennifer A. and Adar, Sara D. and Chiang, Yao-Yi and Lee, Jinkook and Nichols, Emma},
66
doi = {10.2139/ssrn.5943954},
77
issn = {1556-5068},
88
journal = {SSRN Electronic Journal},
99
publisher = {Elsevier BV},
10-
title = {<p>Validating Machine Learning–Derived Built Environment Measures From Google Street View for Urban Aging Research in India</p>},
10+
title = {Validating Machine Learning–Derived Built Environment Measures From Google Street View for Urban Aging Research in India},
1111
url = {https://doi.org/10.2139/ssrn.5943954},
1212
year = {2025}
1313
}
@@ -307,7 +307,7 @@ @inproceedings{10.1007/978-3-032-09530-5_26
307307
isbn = {978-3-032-09530-5},
308308
pages = {451--471},
309309
publisher = {Springer Nature Switzerland},
310-
title = {Exploiting LLMs and Semantic Technologies to Build a Knowledge Graph of Historical Mining Data},
310+
title = {Exploiting LLMs and Semantic Technologies to Build a Knowledge Graph of Historical Mining Data},
311311
url = {https://link.springer.com/chapter/10.1007/978-3-032-09530-5_26},
312312
year = {2026}
313313
}
@@ -325,7 +325,7 @@ @inproceedings{10.1007/978-3-032-04617-8_3
325325
isbn = {978-3-032-04617-8},
326326
pages = {40--59},
327327
publisher = {Springer Nature Switzerland},
328-
title = {LDTR: Linear Object Detection Transformer for Accurate Graph Generation by Learning the N-Hop Connectivity Information},
328+
title = {LDTR: Linear Object Detection Transformer for Accurate Graph Generation by Learning the N-Hop Connectivity Information},
329329
url = {https://link.springer.com/chapter/10.1007/978-3-032-04617-8_3},
330330
year = {2026}
331331
}
@@ -345,7 +345,7 @@ @inproceedings{10.1007/978-3-032-04617-8_4
345345
isbn = {978-3-032-04617-8},
346346
pages = {60--77},
347347
publisher = {Springer Nature Switzerland},
348-
title = {LIGHT: Multi-modal Text Linking on Historical Maps},
348+
title = {LIGHT: Multi-modal Text Linking on Historical Maps},
349349
url = {https://link.springer.com/chapter/10.1007/978-3-032-04617-8_4},
350350
year = {2026}
351351
}

scripts/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .providers import *

scripts/_archived_full.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ def token_set_ratio(a: str, b: str) -> float:
111111
def strip_html_tags(s: str) -> str:
112112
if not s:
113113
return ""
114-
# Crossref abstracts are sometimes JATS-ish; strip tags crudely.
115114
s = re.sub(r"<[^>]+>", " ", s)
116115
s = html.unescape(s)
117116
return normalize_ws(s)
@@ -320,15 +319,15 @@ def springer_bibtex_by_doi(doi: str) -> str:
320319
def get_bibtex_with_fallback(p_full: dict, title: str) -> str:
321320
# 1) Try directly
322321
try:
323-
s = scholarly.bibtex(p_full)
322+
s = scholarly.bibtex(p_full) # Get bibtext directly
324323
if s:
325324
return s
326325
except Exception:
327326
pass
328327

329328
# 2) Fallback: search by title
330329
try:
331-
q = scholarly.search_pubs(title)
330+
q = scholarly.search_pubs(title) # Search by title
332331
pub2 = next(q, None)
333332
if not pub2:
334333
return ""
@@ -854,7 +853,6 @@ def main():
854853
best_doi = (best.get("DOI") or "").strip()
855854
if best_doi:
856855
crossref_bib = crossref_bibtex_transform(best_doi)
857-
# also fetch message if you want abstract etc.
858856
try:
859857
crossref_msg = crossref_lookup_by_doi(best_doi)
860858
except Exception:

scripts/helpers.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from typing import List
2+
3+
import re
4+
5+
import bibtexparser
6+
7+
# Regex patterns
DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
ARXIV_WORD_RE = re.compile(r"\barxiv\b", re.I)


def extract_doi_any(list_text: List[str]) -> str:
    """Return the first DOI found in any of the given strings.

    Falsy entries (None, "") are skipped. Returns "" when no string
    contains a DOI.
    """
    for candidate in filter(None, list_text):
        found = DOI_RE.search(candidate)
        if found:
            return found.group(1)
    return ""
25+
26+
def citation_type(venue: str = None,
                  citation: str = None,
                  scholar_bibtex: str = None,
                  link: str = None,
                  doi: str = None) -> str:
    """Classify a publication to choose the provider used for bibtex extraction.

    Returns one of 'arxiv', 'acm', 'springer', or 'fallback'.

    All arguments are optional metadata fragments; any of them may be
    None or empty, in which case they simply do not contribute to the
    classification.
    """
    # BUG FIX: only `venue` was guarded with `or ""`; passing None for
    # citation/scholar_bibtex/link made re.search raise TypeError, and a
    # None doi crashed on .startswith. Guard every field.
    if any(ARXIV_WORD_RE.search(text or "")
           for text in (venue, citation, scholar_bibtex, link)):
        return 'arxiv'

    doi = doi or ""
    if doi.startswith("10.1145/"):   # ACM's DOI registrant prefix
        return 'acm'
    if doi.startswith("10.1007/"):   # Springer's DOI registrant prefix
        return 'springer'

    return 'fallback'
47+
48+
def bibtex_to_fields(bibtex_str: str) -> dict:
    """Parse a bibtex string into a flat field dict.

    Only the first entry is used. Keys are lower-cased and stripped,
    values stringified and stripped. Returns {} on empty input, on a
    parse failure, or when the parsed database has no entries.
    """
    if not bibtex_str:
        return {}

    try:
        database = bibtexparser.loads(bibtex_str)
    except Exception:
        # Malformed bibtex: treat as "no fields" rather than propagate.
        return {}

    entries = getattr(database, "entries", None)
    if not entries:
        return {}

    first = entries[0]
    return {
        str(key).lower().strip(): str(value).strip()
        for key, value in first.items()
        if value is not None
    }

scripts/providers/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from .arxiv import arxiv_entry, extract_arxiv_any
2+
from .acm import acm_bibtex_by_doi
3+
from .springer import springer_bibtex_by_doi
4+
from .crossref import crossref_entry
5+
from .scholarlyp import get_bibtex_with_fallback
6+
from .utils import build_entry_bibtex

scripts/providers/acm.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import requests
2+
3+
# Browser-like User-Agent — presumably set so dl.acm.org serves the
# citation download to a non-browser client; confirm if requests start failing.
UA_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120 Safari/537.36"
    )
}


def acm_bibtex_by_doi(doi: str) -> str:
    """Download a bibtex record from the ACM Digital Library for an ACM DOI.

    Returns "" for non-ACM DOIs, on any network error, or when the
    response does not look like bibtex (best-effort, never raises).
    """
    if not doi.startswith("10.1145/"):  # only ACM-registered DOIs
        return ""

    try:
        response = requests.get(
            "https://dl.acm.org/action/downloadCitation",
            params={"doi": doi, "format": "bibtex"},
            headers=UA_HEADERS,
            timeout=25,
        )
        if response.status_code == 200 and "@" in response.text:
            return response.text.strip()
    except Exception:
        # Deliberate best-effort: callers fall back to other providers.
        pass

    return ""

scripts/providers/arxiv.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
from typing import List
2+
3+
import re
4+
5+
import html
6+
import requests
7+
8+
from .utils import *
9+
10+
# Matches arxiv.org /abs/ and /pdf/ URLs and captures the identifier:
# new-style (2101.12345, optional vN) or old-style (cs/0112017).
ARXIV_ID_RE = re.compile(
    r"(?:arxiv\.org/(?:abs|pdf)/)(?P<id>(?:\d{4}\.\d{4,5}|[a-z\-]+/\d{7})(?:v\d+)?)",
    re.I,
)

UA_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120 Safari/537.36"
    )
}


def extract_arxiv_any(list_text: List[str]) -> str:
    """Return the first arXiv identifier found in any of the given strings.

    Tries arxiv.org URLs first, then bare "arXiv: NNNN.NNNNN" citations.
    Falsy entries are skipped; returns "" when nothing matches.
    """
    for blob in list_text:
        if not blob:
            continue

        url_hit = ARXIV_ID_RE.search(blob)
        if url_hit:
            return url_hit.group("id")

        cite_hit = re.search(r"\barxiv:\s*([0-9]{4}\.[0-9]{4,5}(?:v\d+)?)\b", blob, re.I)
        if cite_hit:
            return cite_hit.group(1)

    return ""
39+
40+
def xml_text(tag: str,
             xml: str) -> str:
    """Return the unescaped, stripped text of the first <tag>...</tag> in xml.

    Case-insensitive, tolerates attributes on the opening tag, matches
    across newlines. Returns "" when the tag is absent.
    """
    found = re.search(rf"<{tag}[^>]*>(.*?)</{tag}>", xml, flags=re.S | re.I)
    if not found:
        return ""
    return html.unescape(found.group(1)).strip()
44+
45+
def arxiv_meta_by_id(arxiv_id: str) -> dict:
    """Fetch paper metadata from the arXiv Atom API by identifier.

    Returns a dict with keys title/authors/year/url/abstract/
    primary_category; individual values may be "" when absent from the
    feed. Raises requests exceptions on HTTP/network failure.
    """
    url = "http://export.arxiv.org/api/query"
    params = {"id_list": arxiv_id}
    r = requests.get(url, params=params, headers=UA_HEADERS, timeout=25)
    r.raise_for_status()
    xml = r.text

    # The feed carries several <title> elements; take the last one as the
    # entry's title (the first is the feed-level title).
    title = normalize_ws(xml_text("title", xml))
    titles = re.findall(r"<title[^>]*>(.*?)</title>", xml, flags=re.S | re.I)
    if titles:
        title = normalize_ws(html.unescape(titles[-1]))

    summary = normalize_ws(xml_text("summary", xml))
    # BUG FIX: the tag name was misspelled "publishedm", so the Atom
    # <published> date was never found and year was always "".
    published = xml_text("published", xml)
    year = published[:4] if published[:4].isdigit() else ""

    names = re.findall(r"<author>\s*<name>(.*?)</name>\s*</author>", xml, flags=re.S | re.I)
    authors = " and ".join(normalize_ws(html.unescape(n)) for n in names if normalize_ws(n))

    # Entry URL: the Atom <id> element holds the abs-page URL.
    entry_id = normalize_ws(xml_text("id", xml))

    mcat = re.search(r'<arxiv:primary_category[^>]+term="([^"]+)"', xml, flags=re.I)
    primary_cat = mcat.group(1).strip() if mcat else ""

    return {
        "title": title,
        "authors": authors,
        "year": year,
        "url": entry_id,
        "abstract": summary,
        "primary_category": primary_cat,
    }
82+
83+
def arxiv_meta_by_title(title: str,
                        first_author: str = "") -> dict:
    """Search the arXiv API by title and pick the best match.

    Queries ti:"<title>" (optionally AND au:<surname>), scores each
    returned entry's title against the requested one, and returns the
    best candidate as {"arxiv_id", "entry_url", "matched_title",
    "score"} — or {} when no entry could be scored.
    """
    qtitle = title.replace('"', "")
    search = f'ti:"{qtitle}"'

    if first_author:
        # Surname heuristic: last whitespace token before any comma.
        surname = first_author.split(",", 1)[0].split()[-1]
        search = f'{search} AND au:{surname}'

    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": search, "start": 0, "max_results": 5},
        headers=UA_HEADERS,
        timeout=25,
    )
    response.raise_for_status()
    xml = response.text

    wanted = normalize_title(title)
    best = None
    best_score = 0.0

    # Crude Atom handling: split the feed on </entry> and inspect each chunk.
    for fragment in re.split(r"</entry>\s*", xml, flags=re.I):
        if "<entry" not in fragment.lower():
            continue

        titles = re.findall(r"<title[^>]*>(.*?)</title>", fragment, flags=re.S | re.I)
        if not titles:
            continue

        matched_title = normalize_ws(html.unescape(titles[-1]))
        score = seq_ratio(wanted, normalize_title(matched_title))
        if score <= best_score:
            continue

        # New best: pull the entry URL and derive the arXiv id from it.
        id_match = re.search(r"<id[^>]*>(.*?)</id>", fragment, flags=re.S | re.I)
        entry_url = normalize_ws(html.unescape(id_match.group(1))) if id_match else ""
        best_score = score
        best = {
            "arxiv_id": extract_arxiv_any([entry_url]) or "",
            "entry_url": entry_url,
            "matched_title": matched_title,
            "score": score,
        }

    return best or {}
126+
127+
128+
129+
def _build_arxiv_meta(arxiv_id: str = None,
                      title: str = None,
                      authors_guess: str = None) -> dict:
    """Resolve arXiv metadata, falling back to a title search.

    When no arxiv_id is given, tries to discover one via
    arxiv_meta_by_title (authors_guess serves as the first-author hint).
    Returns {} when no id can be resolved or the metadata fetch fails.

    BUG FIXES: the return annotation said ``-> None`` although a dict is
    always returned; a missing title used to crash the fallback search;
    and an unresolved id used to hit the API with an empty id_list.
    """
    if not arxiv_id and title:
        best_guess = arxiv_meta_by_title(title,
                                         first_author=authors_guess or "")
        arxiv_id = best_guess.get("arxiv_id") or None

    if not arxiv_id:
        # Nothing to look up — avoid querying the API with an empty id.
        return {}

    try:
        return arxiv_meta_by_id(arxiv_id)
    except Exception:
        # Best-effort: callers treat {} as "no arXiv metadata available".
        return {}
142+
143+
def arxiv_entry(base_title: str,
                base_year: str,
                base_venue: str,
                base_link: str,
                authors_guess: str,
                scholar_bibtex: str,
                citation: str) -> dict:
    """Build a bibtex entry dict for an arXiv paper.

    Locates an arXiv id in the link / scholar bibtex / citation text,
    enriches the entry from the arXiv API, and falls back to the
    provided base_* values field by field. base_venue is currently
    unused but kept for a uniform provider signature.
    """
    arxiv_id = extract_arxiv_any([base_link, scholar_bibtex, citation])
    # BUG FIX: title/authors were not forwarded, so the title-search
    # fallback inside _build_arxiv_meta could never trigger when no
    # arXiv id was present in the input text.
    arxiv_meta = _build_arxiv_meta(arxiv_id,
                                   title=base_title,
                                   authors_guess=authors_guess)

    # Prefer API metadata; fall back to the caller-supplied values.
    title = arxiv_meta.get("title") or base_title
    authors = arxiv_meta.get("authors") or normalize_authors_to_bibtex(authors_guess)
    year = arxiv_meta.get("year") or base_year
    url = arxiv_meta.get("url") or base_link

    entry = {
        "ENTRYTYPE": "misc",
        "title": title,
        "author": normalize_authors_to_bibtex(authors),
        "year": year,
        "howpublished": "arXiv",
        "url": url,
    }

    if arxiv_meta.get("primary_category"):
        entry["primaryclass"] = arxiv_meta["primary_category"]

    # Abstract is optional but requested when available.
    if arxiv_meta.get("abstract"):
        entry["abstract"] = arxiv_meta["abstract"]

    # Deterministic bib key derived from author/year/title.
    entry["ID"] = make_bib_key(entry.get("author", ""), entry.get("year", ""), entry.get("title", ""))

    return entry

scripts/providers/crossref.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import html
import re


def clean_crossref_text(s: str) -> str:
    """Convert HTML entities (&lt; etc.) to characters, strip tags (<p>),
    and normalize whitespace.

    BUG FIX: the file was committed with no imports at all — ``html``,
    ``re``, and ``normalize_ws`` were undefined, so any call raised
    NameError. Stdlib imports are added and the whitespace collapse is
    inlined (mirrors utils.normalize_ws — confirm if that helper does
    anything beyond collapsing runs of whitespace).
    """
    if not s:
        return ""
    s = html.unescape(s)            # &lt;p&gt; -> <p>
    s = re.sub(r"<[^>]+>", " ", s)  # remove tags like <p>
    return " ".join(s.split())      # collapse all whitespace runs

0 commit comments

Comments
 (0)