diff --git a/atcoder-problems-backend/crawler/src/parser.rs b/atcoder-problems-backend/crawler/src/parser.rs index 14b5f900..d1fd00e3 100644 --- a/atcoder-problems-backend/crawler/src/parser.rs +++ b/atcoder-problems-backend/crawler/src/parser.rs @@ -251,159 +251,125 @@ pub fn parse_submissions_html(html_content: &str) -> Result, Cra let mut submissions = Vec::new(); + let td_selector = + Selector::parse("td").map_err(|e| CrawlerError::SelectorError(e.to_string()))?; + let input_selector = + Selector::parse("input").map_err(|e| CrawlerError::SelectorError(e.to_string()))?; + let time_selector = + Selector::parse("time").map_err(|e| CrawlerError::SelectorError(e.to_string()))?; + let a_selector = + Selector::parse("a").map_err(|e| CrawlerError::SelectorError(e.to_string()))?; + let span_selector = + Selector::parse("span").map_err(|e| CrawlerError::SelectorError(e.to_string()))?; + let details_selector = Selector::parse("a.submission-details-link") + .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; + for row in document.select(&row_selector) { - // Extract submission ID from the details link - let details_selector = Selector::parse("td:last-child a.submission-details-link") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let details_element = row.select(&details_selector).next(); - - let id = if let Some(details_elem) = details_element { - if let Some(href) = details_elem.value().attr("href") { - // Extract ID from the URL (e.g., "/contests/abc399/submissions/64188418" -> "64188418") - href.split('/') - .next_back() - .unwrap_or("") - .parse::() - .map_err(|e| { - CrawlerError::ParseError(format!("Failed to parse submission ID: {}", e)) - })? - } else { - continue; // Skip if no URL is found - } - } else { - continue; // Skip if no details link is found + let tds = row.select(&td_selector).collect::>(); + + // Contests where the crawler's account has rejudge privileges (e.g. an ABC + // held jointly with another contest the account staffs) render an extra + // leading checkbox column, shifting every other column right by one. + // Detect that column so the offsets below stay correct. + let offset = match tds.first() { + Some(first) if first.select(&input_selector).next().is_some() => 1, + _ => 0, }; - // Extract submission date - let date_selector = Selector::parse("td:first-child time") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let date_element = row.select(&date_selector).next(); - - let epoch_second = if let Some(date_elem) = date_element { - let date_str = date_elem.text().collect::(); - // Parse the date string (e.g., "2024-04-05 12:34:56+0900") - let datetime = DateTime::parse_from_str(&date_str, "%Y-%m-%d %H:%M:%S%z") - .map_err(|e| CrawlerError::ParseError(format!("Failed to parse date: {}", e)))?; - datetime.timestamp() - } else { + // Columns after the offset: date, problem, user, language, score, + // code length, result, execution time. + if tds.len() < offset + 8 { + continue; // Not a submission row + } + + // Submission ID and detail URL come from the details link, located by + // class so it is independent of the column offset. + let Some(href) = row + .select(&details_selector) + .next() + .and_then(|e| e.value().attr("href")) + else { + continue; // Skip rows without a submission details link + }; + // Extract ID from the URL (e.g. "/contests/abc399/submissions/64188418" -> "64188418") + let id = href + .split('/') + .next_back() + .unwrap_or("") + .parse::() + .map_err(|e| { + CrawlerError::ParseError(format!("Failed to parse submission ID: {}", e)) + })?; + // Extract contest ID from the URL (e.g. "/contests/abc399/submissions/64188418" -> "abc399") + let contest_id = href.split('/').nth(2).unwrap_or("").to_string(); + + // Submission date + let Some(date_elem) = tds[offset].select(&time_selector).next() else { continue; // Skip if no date is found }; - - // Extract problem - let problem_selector = Selector::parse("td:nth-child(2) a") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let problem_element = row.select(&problem_selector).next(); - - let problem = if let Some(problem_elem) = problem_element { - // Extract problem ID from the URL (e.g., "/contests/abc399/tasks/abc399_a" -> "abc399_a") - if let Some(href) = problem_elem.value().attr("href") { - href.split('/').next_back().unwrap_or("").to_string() - } else { - continue; // Skip if no URL is found - } - } else { - continue; // Skip if no problem is found + let date_str = date_elem.text().collect::(); + // Parse the date string (e.g. "2024-04-05 12:34:56+0900") + let epoch_second = DateTime::parse_from_str(date_str.trim(), "%Y-%m-%d %H:%M:%S%z") + .map_err(|e| CrawlerError::ParseError(format!("Failed to parse date: {}", e)))? + .timestamp(); + + // Problem ID from the task link (e.g. "/contests/abc399/tasks/abc399_a" -> "abc399_a") + let problem = match tds[offset + 1] + .select(&a_selector) + .next() + .and_then(|a| a.value().attr("href")) + { + Some(href) => href.split('/').next_back().unwrap_or("").to_string(), + None => continue, // Skip if no problem is found }; - // Extract user - let user_selector = Selector::parse("td:nth-child(3) a:first-child") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let user_element = row.select(&user_selector).next(); - - let user = if let Some(user_elem) = user_element { - if let Some(href) = user_elem.value().attr("href") { - // Extract user_id from the URL path like "/users/{user_id}" - href.split('/') - .nth(2) // Get the third component after splitting (index 2) - .unwrap_or("") - .to_string() - } else { - continue; // Skip if no href is found - } - } else { - continue; // Skip if no user is found + // User ID from the first link in the cell (e.g. "/users/{user_id}") + let user = match tds[offset + 2] + .select(&a_selector) + .next() + .and_then(|a| a.value().attr("href")) + { + Some(href) => href.split('/').nth(2).unwrap_or("").to_string(), + None => continue, // Skip if no user is found }; - // Extract language - let language_selector = Selector::parse("td:nth-child(4) a") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let language_element = row.select(&language_selector).next(); - - let language = if let Some(language_elem) = language_element { - language_elem.text().collect::() - } else { + // Language + let Some(language_elem) = tds[offset + 3].select(&a_selector).next() else { continue; // Skip if no language is found }; - - // Extract score - let score_selector = Selector::parse("td:nth-child(5)") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let score_element = row.select(&score_selector).next(); - - let score = if let Some(score_elem) = score_element { - score_elem - .text() - .collect::() - .parse::() - .map_err(|e| CrawlerError::ParseError(format!("Failed to parse score: {}", e)))? - } else { - continue; // Skip if no score is found - }; - - // Extract code length - let code_length_selector = Selector::parse("td:nth-child(6)") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let code_length_element = row.select(&code_length_selector).next(); - - let code_length = if let Some(code_length_elem) = code_length_element { - let text = code_length_elem.text().collect::(); - // Remove " Byte" from the end and parse as u32 - text.trim_end_matches(" Byte").parse().map_err(|e| { + let language = language_elem.text().collect::(); + + // Score + let score = tds[offset + 4] + .text() + .collect::() + .trim() + .parse::() + .map_err(|e| CrawlerError::ParseError(format!("Failed to parse score: {}", e)))?; + + // Code length (e.g. "656 Byte") + let code_length = tds[offset + 5] + .text() + .collect::() + .trim() + .trim_end_matches(" Byte") + .parse() + .map_err(|e| { CrawlerError::ParseError(format!("Failed to parse code length: {}", e)) - })? - } else { - continue; // Skip if no code length is found - }; + })?; - // Extract result - let result_selector = Selector::parse("td:nth-child(7) span") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let result_element = row.select(&result_selector).next(); - - let result = if let Some(result_elem) = result_element { - result_elem.text().collect::() - } else { + // Result (e.g. "AC") + let Some(result_elem) = tds[offset + 6].select(&span_selector).next() else { continue; // Skip if no result is found }; + let result = result_elem.text().collect::(); - // Extract execution time - let execution_time_selector = Selector::parse("td:nth-child(8)") - .map_err(|e| CrawlerError::SelectorError(e.to_string()))?; - let execution_time_element = row.select(&execution_time_selector).next(); - - let execution_time = execution_time_element.and_then(|e| { - let text = e.text().collect::(); - text.trim_end_matches(" ms").parse::().ok() - }); - - // Get the URL from the details link - let url = if let Some(details_elem) = details_element { - if let Some(href) = details_elem.value().attr("href") { - href.to_string() - } else { - continue; // Skip if no URL is found - } - } else { - continue; // Skip if no details link is found + // Execution time (e.g. "479 ms"); absent for some results + let execution_time = { + let text = tds[offset + 7].text().collect::(); + text.trim().trim_end_matches(" ms").parse::().ok() }; - // Extract contest ID from the URL (e.g., "/contests/abc399/submissions/64188418" -> "abc399") - let contest_id = url - .split('/') - .nth(2) // Get the third component after splitting - .unwrap_or("") - .to_string(); - submissions.push(Submission { id, epoch_second, diff --git a/atcoder-problems-backend/crawler/tests/assets/submissions_rejudge.html b/atcoder-problems-backend/crawler/tests/assets/submissions_rejudge.html new file mode 100644 index 00000000..b38abf2e --- /dev/null +++ b/atcoder-problems-backend/crawler/tests/assets/submissions_rejudge.html @@ -0,0 +1,305 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Submission TimeTaskUserLanguageScoreCode SizeStatusExec TimeMemory
Ex - Make Qosakanishi Python (PyPy 3.11-v7.3.20)6252810 ByteAC1587 ms111904 KiB + Detail +
Ex - Make Qosakanishi Python (CPython 3.13.7)02810 ByteTLE> 4000 ms11396 KiB + Detail +
C - Standingsyuusya78 Python (CPython 3.13.7)0203 ByteWA406 ms64876 KiB + Detail +
E - MEXrestofwaterimp C# 13.0 (.NET 9.0.8)02799 ByteTLE> 2000 ms> 1048576 KiB + Detail +
B - Default PriceRyotaro45123 Python (PyPy 3.11-v7.3.20)200631 ByteAC55 ms80128 KiB + Detail +
C - Standingsokku_n9 C++23 (GCC 15.2.0)300883 ByteAC120 ms9724 KiB + Detail +
C - Standingsokku_n9 C++23 (GCC 15.2.0)0891 ByteWA129 ms9640 KiB + Detail +
F - Voucherstatami_ Rust (rustc 1.89.0)5001923 ByteAC73 ms40488 KiB + Detail +
F - Vouchersjim1424 C++23 (GCC 15.2.0)5002557 ByteAC292 ms20260 KiB + Detail +
B - Default Pricevjudge1 C++23 (Clang 21.1.0)0498 ByteRE102 ms2956 KiB + Detail +
D - Snuke Mazerestofwaterimp C# 13.0 (.NET 9.0.8)4003483 ByteAC50 ms30176 KiB + Detail +
B - Default Pricezhouhongyi C++23 (Clang 21.1.0)0498 ByteCE + Detail +
B - Default Pricevjudge1 C++ IOI-Style(GNU++20) (GCC 14.2.0)0627 ByteCE + Detail +
B - Default Pricevjudge1 C++ IOI-Style(GNU++20) (GCC 14.2.0)0483 ByteCE + Detail +
B - Default Pricemahuateng C++ IOI-Style(GNU++20) (GCC 14.2.0)0257 ByteWA0 ms1704 KiB + Detail +
E - MEXjim1424 C++23 (GCC 15.2.0)4752686 ByteAC65 ms79632 KiB + Detail +
D - Snuke Mazejim1424 C++23 (GCC 15.2.0)4002773 ByteAC16 ms14304 KiB + Detail +
C - Standingsjim1424 C++23 (GCC 15.2.0)3002334 ByteAC133 ms8236 KiB + Detail +
C - Standingsrestofwaterimp C# 13.0 (.NET 9.0.8)3001429 ByteAC246 ms41656 KiB + Detail +
C - Standingsrestofwaterimp C# 13.0 (.NET 9.0.8)01484 ByteWA161 ms45512 KiB + Detail +
+ diff --git a/atcoder-problems-backend/crawler/tests/test_parser.rs b/atcoder-problems-backend/crawler/tests/test_parser.rs index 17db5a37..7b8f41bc 100644 --- a/atcoder-problems-backend/crawler/tests/test_parser.rs +++ b/atcoder-problems-backend/crawler/tests/test_parser.rs @@ -140,6 +140,50 @@ fn test_parse_submissions_html() { } } +#[test] +fn test_parse_submissions_html_with_rejudge_column() { + // Contests the crawler account can rejudge (e.g. abc308, held jointly with a + // CodeQUEEN qualifier) render an extra leading checkbox column that shifts + // every other column. The parser must still extract every submission. + let html_content = include_str!("assets/submissions_rejudge.html"); + + let submissions = + parse_submissions_html(html_content).expect("Failed to parse submissions HTML"); + + assert_eq!( + submissions.len(), + 20, + "Expected 20 submissions, found {}", + submissions.len() + ); + + let first = &submissions[0]; + assert_eq!(first.id, 76670520); + assert_eq!(first.contest_id, "abc308"); + assert_eq!(first.problem_id, "abc308_h"); + assert_eq!(first.user, "osakanishi"); + assert_eq!(first.language, "Python (PyPy 3.11-v7.3.20)"); + assert_eq!(first.score, 625.); + assert_eq!(first.code_length, 2810); + assert_eq!(first.result, "AC"); + + let last = &submissions[19]; + assert_eq!(last.id, 76603243); + assert_eq!(last.problem_id, "abc308_c"); + assert_eq!(last.user, "restofwaterimp"); + assert_eq!(last.result, "WA"); + + for submission in &submissions { + assert!(submission.id > 0); + assert!(submission.epoch_second > 0); + assert_eq!(submission.contest_id, "abc308"); + assert!(!submission.problem_id.is_empty()); + assert!(!submission.user.is_empty()); + assert!(!submission.language.is_empty()); + assert!(!submission.result.is_empty()); + } +} + #[test] fn test_parse_tasks_html_abc308() { // Load the test HTML file