From 3552f8814841cda6b52d3ce7ab0a4742b421603c Mon Sep 17 00:00:00 2001 From: NewdlDewdl Date: Tue, 3 Mar 2026 21:38:10 -0600 Subject: [PATCH 1/4] test: add unit test methodology for scrapers with fixtures Addresses #46 by establishing a fixture-based testing approach for scrapers that avoids live network dependencies. Changes: - Extract testable helper functions from cometCalendar and coursebook scrapers - Add comprehensive unit tests covering HTTP request behavior, parsing logic, and caching - Create testdata/ directory with sample API responses and HTML fixtures - Document methodology in README: use fixtures + httptest.Server for deterministic tests Tests verify: - HTTP request construction (query params, headers) - Response parsing and error handling - Section ID extraction from HTML - File-based caching and missing ID filtering - Event time/location/filter parsing All tests pass and are fast (<1s) with no external dependencies. --- README.md | 11 + scrapers/cometCalendar.go | 13 +- scrapers/cometCalendar_test.go | 220 ++++++++++++++++++ scrapers/coursebook.go | 18 +- scrapers/coursebook_test.go | 81 +++++++ scrapers/testdata/cometCalendar/page0.json | 40 ++++ .../coursebook/search-results-sample.html | 9 + 7 files changed, 382 insertions(+), 10 deletions(-) create mode 100644 scrapers/cometCalendar_test.go create mode 100644 scrapers/coursebook_test.go create mode 100644 scrapers/testdata/cometCalendar/page0.json create mode 100644 scrapers/testdata/coursebook/search-results-sample.html diff --git a/README.md b/README.md index 0142f69..785615a 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,17 @@ Additionally, you can run build (on Windows) and make (on MacOS/Linux) with the - `test`: Test run to see if the executable works after building - `build`: Builds the executble and makes it ready for use. 
+### Scraper Unit Testing Methodology + +Scrapers depend on external websites/APIs, so unit tests should avoid live network calls and instead replay representative samples. + +- Store captured payloads/HTML fixtures in `scrapers/testdata/`. +- Keep request parsing/normalization logic in helper functions that can be invoked directly from tests. +- For HTTP scrapers, use `httptest.Server` to serve fixture payloads and validate request query/header behavior. +- Keep end-to-end scraping against real sources as manual/integration checks, not unit tests. + +This keeps tests deterministic, fast, and resilient to external website changes while still validating scraper logic. + ### Usage The `api-tools` command line interface supports three main modes: scraping, parsing and uploading data to the Nebula API. diff --git a/scrapers/cometCalendar.go b/scrapers/cometCalendar.go index b79a28a..d2d5d2f 100644 --- a/scrapers/cometCalendar.go +++ b/scrapers/cometCalendar.go @@ -131,11 +131,15 @@ func ScrapeCometCalendar(outDir string) { log.Printf("Finished scraping %d events successfully!\n\n", len(calendarEvents)) } -// callAndUnmarshal fetches a calendar page and decodes it into data. +// callAndUnmarshal fetches a calendar page from the production API and decodes it into data. func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error { - // Call API to get the byte data - calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", COMET_CALENDAR_URL, page) - request, err := http.NewRequest("GET", calendarUrl, nil) + return callAndUnmarshalFromURL(client, COMET_CALENDAR_URL, page, data) +} + +// callAndUnmarshalFromURL fetches a calendar page from baseURL and decodes it into data. 
+func callAndUnmarshalFromURL(client *http.Client, baseURL string, page int, data *APICalendarResponse) error { + calendarURL := fmt.Sprintf("%s?days=365&pp=100&page=%d", baseURL, page) + request, err := http.NewRequest("GET", calendarURL, nil) if err != nil { return err } @@ -153,7 +157,6 @@ func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) } defer response.Body.Close() - // Unmarshal bytes to the response data buffer := bytes.Buffer{} if _, err = buffer.ReadFrom(response.Body); err != nil { return err diff --git a/scrapers/cometCalendar_test.go b/scrapers/cometCalendar_test.go new file mode 100644 index 0000000..c992000 --- /dev/null +++ b/scrapers/cometCalendar_test.go @@ -0,0 +1,220 @@ +package scrapers + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + "time" +) + +func TestCallAndUnmarshalFromURL_Success(t *testing.T) { + t.Parallel() + + payload, err := os.ReadFile(filepath.Join("testdata", "cometCalendar", "page0.json")) + if err != nil { + t.Fatalf("failed to load fixture: %v", err) + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.URL.Query().Get("days"); got != "365" { + t.Errorf("expected query days=365, got %q", got) + } + if got := r.URL.Query().Get("pp"); got != "100" { + t.Errorf("expected query pp=100, got %q", got) + } + if got := r.URL.Query().Get("page"); got != "2" { + t.Errorf("expected query page=2, got %q", got) + } + if got := r.Header.Get("Accept"); got != "application/json" { + t.Errorf("expected Accept header application/json, got %q", got) + } + if got := r.Header.Get("Content-type"); got != "application/json" { + t.Errorf("expected Content-type header application/json, got %q", got) + } + + w.Header().Set("Content-Type", "application/json") + if _, err := w.Write(payload); err != nil { + t.Errorf("failed to write fixture response: %v", err) + } + })) + defer server.Close() + + client := http.Client{Timeout: 
2 * time.Second} + var calendarData APICalendarResponse + + if err := callAndUnmarshalFromURL(&client, server.URL, 2, &calendarData); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if got := calendarData.Page["total"]; got != 3 { + t.Fatalf("expected page.total=3, got %d", got) + } + + if len(calendarData.Events) != 1 { + t.Fatalf("expected 1 event, got %d", len(calendarData.Events)) + } + + event := calendarData.Events[0].Event + if event.Title != "Nebula Testing Workshop" { + t.Errorf("expected title %q, got %q", "Nebula Testing Workshop", event.Title) + } + if event.Custom_fields.Contact_information_email != "team@utdnebula.com" { + t.Errorf("expected contact email %q, got %q", "team@utdnebula.com", event.Custom_fields.Contact_information_email) + } +} + +func TestCallAndUnmarshalFromURL_Non200(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer server.Close() + + client := http.Client{Timeout: 2 * time.Second} + var calendarData APICalendarResponse + + if err := callAndUnmarshalFromURL(&client, server.URL, 1, &calendarData); err == nil { + t.Fatal("expected an error for non-200 response, got nil") + } +} + +func TestGetTime(t *testing.T) { + t.Parallel() + + testCases := map[string]struct { + Start string + End string + Err bool + }{ + "start_and_end": { + Start: "2026-03-01T12:00:00-06:00", + End: "2026-03-01T13:30:00-06:00", + }, + "missing_end_uses_start": { + Start: "2026-03-01T12:00:00-06:00", + End: "", + }, + "invalid_start": { + Start: "not-a-time", + End: "2026-03-01T13:30:00-06:00", + Err: true, + }, + "invalid_end": { + Start: "2026-03-01T12:00:00-06:00", + End: "still-not-a-time", + Err: true, + }, + } + + for name, testCase := range testCases { + t.Run(name, func(t *testing.T) { + t.Parallel() + + event := eventWithTimes(testCase.Start, testCase.End) + start, end, err := getTime(event) 
+ + if testCase.Err { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + expectedStart, _ := time.Parse(time.RFC3339, testCase.Start) + if !start.Equal(expectedStart) { + t.Errorf("unexpected start time: got %v, expected %v", start, expectedStart) + } + + expectedEnd := expectedStart + if testCase.End != "" { + expectedEnd, _ = time.Parse(time.RFC3339, testCase.End) + } + if !end.Equal(expectedEnd) { + t.Errorf("unexpected end time: got %v, expected %v", end, expectedEnd) + } + }) + } +} + +func TestGetEventLocation(t *testing.T) { + t.Parallel() + + testCases := map[string]struct { + Event Event + Expected string + }{ + "building_and_room": { + Event: Event{Location_name: "SSA", Room_number: "2.406"}, + Expected: "SSA, 2.406", + }, + "building_only": { + Event: Event{Location_name: "SSA", Room_number: ""}, + Expected: "SSA", + }, + "room_only": { + Event: Event{Location_name: "", Room_number: "2.406"}, + Expected: "2.406", + }, + } + + for name, testCase := range testCases { + t.Run(name, func(t *testing.T) { + t.Parallel() + + if got := getEventLocation(testCase.Event); got != testCase.Expected { + t.Errorf("expected %q, got %q", testCase.Expected, got) + } + }) + } +} + +func TestGetFiltersAndDepartments(t *testing.T) { + t.Parallel() + + event := Event{ + Filters: Filters{ + Event_types: []FilterMap{{Name: "Workshop"}, {Name: "Networking"}}, + Event_target_audience: []FilterMap{{Name: "Students"}}, + Event_topic: []FilterMap{{Name: "Technology"}, {Name: "Career"}}, + }, + Departments: []FilterMap{{Name: "Engineering"}, {Name: "Career Center"}}, + } + + types, audiences, topics := getFilters(event) + if len(types) != 2 || types[0] != "Workshop" || types[1] != "Networking" { + t.Errorf("unexpected event types: %v", types) + } + if len(audiences) != 1 || audiences[0] != "Students" { + t.Errorf("unexpected audiences: %v", audiences) + } + if len(topics) != 2 || topics[0] 
!= "Technology" || topics[1] != "Career" { + t.Errorf("unexpected topics: %v", topics) + } + + departments := getDepartments(event) + if len(departments) != 2 || departments[0] != "Engineering" || departments[1] != "Career Center" { + t.Errorf("unexpected departments: %v", departments) + } +} + +func eventWithTimes(start string, end string) Event { + return Event{ + Event_instances: []struct { + Event_instance EventInstance `json:"event_instance"` + }{ + { + Event_instance: EventInstance{ + Start: start, + End: end, + }, + }, + }, + } +} diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 4f6119c..1f0c0c2 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -268,17 +268,25 @@ func (s *coursebookScraper) getSectionIdsForPrefix(prefix string) ([]string, err if err != nil { return nil, fmt.Errorf("failed to fetch sections: %s", err) } - sectionRegexp := utils.Regexpf(`View details for section (%s%s\.\w+\.%s)`, prefix[3:], utils.R_COURSE_CODE, utils.R_TERM_CODE) - matches := sectionRegexp.FindAllStringSubmatch(content, -1) - for _, match := range matches { - sections = append(sections, match[1]) - } + sections = append(sections, extractSectionIDs(prefix, content)...) } s.prefixIdsCache[prefix] = sections return sections, nil } +// extractSectionIDs parses search response content and returns all matched section IDs. 
+func extractSectionIDs(prefix string, content string) []string { + sectionRegexp := utils.Regexpf(`View details for section (%s%s\.\w+\.%s)`, prefix[3:], utils.R_COURSE_CODE, utils.R_TERM_CODE) + matches := sectionRegexp.FindAllStringSubmatch(content, -1) + + sections := make([]string, 0, len(matches)) + for _, match := range matches { + sections = append(sections, match[1]) + } + return sections +} + // req utility function for making calling the coursebook api func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (string, error) { var res *http.Response diff --git a/scrapers/coursebook_test.go b/scrapers/coursebook_test.go new file mode 100644 index 0000000..fcc84d7 --- /dev/null +++ b/scrapers/coursebook_test.go @@ -0,0 +1,81 @@ +package scrapers + +import ( + "os" + "path/filepath" + "reflect" + "testing" +) + +func TestExtractSectionIDs_FromFixture(t *testing.T) { + t.Parallel() + + content, err := os.ReadFile(filepath.Join("testdata", "coursebook", "search-results-sample.html")) + if err != nil { + t.Fatalf("failed to load fixture: %v", err) + } + + ids := extractSectionIDs("cp_acct", string(content)) + expected := []string{ + "acct2301.001.25S", + "acct2301.002.25S", + "acct6v01.0W1.25S", + } + + if !reflect.DeepEqual(expected, ids) { + t.Errorf("unexpected ids. expected %v, got %v", expected, ids) + } +} + +func TestGetMissingIdsForPrefix_NoDirectory(t *testing.T) { + t.Parallel() + + ids := []string{"acct2301.001.25S", "acct2301.002.25S"} + scraper := &coursebookScraper{ + term: "25S", + outDir: t.TempDir(), + prefixIdsCache: map[string][]string{"cp_acct": ids}, + } + + missing, err := scraper.getMissingIdsForPrefix("cp_acct") + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if !reflect.DeepEqual(ids, missing) { + t.Errorf("expected all ids to be missing. 
expected %v, got %v", ids, missing) + } +} + +func TestGetMissingIdsForPrefix_FiltersExistingFiles(t *testing.T) { + t.Parallel() + + ids := []string{"acct2301.001.25S", "acct2301.002.25S", "acct2301.003.25S"} + outDir := t.TempDir() + prefixDir := filepath.Join(outDir, "25S", "cp_acct") + + if err := os.MkdirAll(prefixDir, 0755); err != nil { + t.Fatalf("failed to create prefix directory: %v", err) + } + + existing := "acct2301.002.25S" + if err := os.WriteFile(filepath.Join(prefixDir, existing+".html"), []byte("cached"), 0644); err != nil { + t.Fatalf("failed to seed existing section file: %v", err) + } + + scraper := &coursebookScraper{ + term: "25S", + outDir: outDir, + prefixIdsCache: map[string][]string{"cp_acct": ids}, + } + + missing, err := scraper.getMissingIdsForPrefix("cp_acct") + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + expected := []string{"acct2301.001.25S", "acct2301.003.25S"} + if !reflect.DeepEqual(expected, missing) { + t.Errorf("unexpected missing ids. 
expected %v, got %v", expected, missing) + } +} diff --git a/scrapers/testdata/cometCalendar/page0.json b/scrapers/testdata/cometCalendar/page0.json new file mode 100644 index 0000000..a7a1bf6 --- /dev/null +++ b/scrapers/testdata/cometCalendar/page0.json @@ -0,0 +1,40 @@ +{ + "events": [ + { + "event": { + "title": "Nebula Testing Workshop", + "url": "https://calendar.utdallas.edu/event/nebula_testing_workshop", + "room_number": "2.406", + "location_name": "SSA", + "tags": ["Engineering", "Career"], + "description_text": "Learn how we test scrapers with fixtures.", + "event_instances": [ + { + "event_instance": { + "start": "2026-03-01T12:00:00-06:00", + "end": "2026-03-01T13:00:00-06:00" + } + } + ], + "filters": { + "event_target_audience": [{ "name": "Students", "id": 1 }], + "event_topic": [{ "name": "Technology", "id": 2 }], + "event_types": [{ "name": "Workshop", "id": 3 }] + }, + "custom_fields": { + "contact_information_name": "Nebula Team", + "contact_information_email": "team@utdnebula.com", + "contact_information_phone": "972-000-0000" + }, + "departments": [{ "name": "Engineering", "id": 11 }] + } + } + ], + "page": { + "total": 3 + }, + "date": { + "from": "2026-03-01", + "to": "2027-03-01" + } +} diff --git a/scrapers/testdata/coursebook/search-results-sample.html b/scrapers/testdata/coursebook/search-results-sample.html new file mode 100644 index 0000000..bf384a4 --- /dev/null +++ b/scrapers/testdata/coursebook/search-results-sample.html @@ -0,0 +1,9 @@ + + +
View details for section acct2301.001.25S
+
View details for section acct2301.002.25S
+
View details for section acct6v01.0W1.25S
+
View details for section cs1337.001.25S
+
This line should not match
+ + From d9c5f7b5d626c4550f5b2da8b7b01b38143d58d4 Mon Sep 17 00:00:00 2001 From: NewdlDewdl Date: Tue, 3 Mar 2026 21:42:10 -0600 Subject: [PATCH 2/4] test: harden scraper unit tests with negative cases --- scrapers/cometCalendar_test.go | 19 +++++++++++++++++++ scrapers/coursebook_test.go | 11 +++++++++++ 2 files changed, 30 insertions(+) diff --git a/scrapers/cometCalendar_test.go b/scrapers/cometCalendar_test.go index c992000..2ac9d9c 100644 --- a/scrapers/cometCalendar_test.go +++ b/scrapers/cometCalendar_test.go @@ -81,6 +81,25 @@ func TestCallAndUnmarshalFromURL_Non200(t *testing.T) { } } +func TestCallAndUnmarshalFromURL_InvalidJSON(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + if _, err := w.Write([]byte(`{"events":[`)); err != nil { + t.Errorf("failed to write invalid response: %v", err) + } + })) + defer server.Close() + + client := http.Client{Timeout: 2 * time.Second} + var calendarData APICalendarResponse + + if err := callAndUnmarshalFromURL(&client, server.URL, 1, &calendarData); err == nil { + t.Fatal("expected json unmarshal error, got nil") + } +} + func TestGetTime(t *testing.T) { t.Parallel() diff --git a/scrapers/coursebook_test.go b/scrapers/coursebook_test.go index fcc84d7..8de371a 100644 --- a/scrapers/coursebook_test.go +++ b/scrapers/coursebook_test.go @@ -27,6 +27,17 @@ func TestExtractSectionIDs_FromFixture(t *testing.T) { } } +func TestExtractSectionIDs_NoMatches(t *testing.T) { + t.Parallel() + + content := `
View details for section cs1337.001.25S
` + ids := extractSectionIDs("cp_acct", content) + + if len(ids) != 0 { + t.Errorf("expected no ids, got %v", ids) + } +} + func TestGetMissingIdsForPrefix_NoDirectory(t *testing.T) { t.Parallel() From b31245323d085765cf7c1ebe9f89bdea43823a89 Mon Sep 17 00:00:00 2001 From: NewdlDewdl Date: Tue, 3 Mar 2026 21:46:35 -0600 Subject: [PATCH 3/4] docs(test): define scraper sample criteria and cover empty payload --- README.md | 5 ++++ scrapers/cometCalendar_test.go | 30 +++++++++++++++++++ .../testdata/cometCalendar/page-empty.json | 10 +++++++ 3 files changed, 45 insertions(+) create mode 100644 scrapers/testdata/cometCalendar/page-empty.json diff --git a/README.md b/README.md index 785615a..81df56b 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,11 @@ Scrapers depend on external websites/APIs, so unit tests should avoid live netwo - For HTTP scrapers, use `httptest.Server` to serve fixture payloads and validate request query/header behavior. - Keep end-to-end scraping against real sources as manual/integration checks, not unit tests. +When choosing scraper samples, include at least: +- A representative success payload from production-like data. +- A sparse or empty payload (to verify no-data handling). +- A malformed payload or non-200 response (to verify error handling). + This keeps tests deterministic, fast, and resilient to external website changes while still validating scraper logic. 
### Usage diff --git a/scrapers/cometCalendar_test.go b/scrapers/cometCalendar_test.go index 2ac9d9c..f67de25 100644 --- a/scrapers/cometCalendar_test.go +++ b/scrapers/cometCalendar_test.go @@ -65,6 +65,36 @@ func TestCallAndUnmarshalFromURL_Success(t *testing.T) { } } +func TestCallAndUnmarshalFromURL_EmptyPayload(t *testing.T) { + t.Parallel() + + payload, err := os.ReadFile(filepath.Join("testdata", "cometCalendar", "page-empty.json")) + if err != nil { + t.Fatalf("failed to load empty fixture: %v", err) + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + if _, err := w.Write(payload); err != nil { + t.Errorf("failed to write fixture response: %v", err) + } + })) + defer server.Close() + + client := http.Client{Timeout: 2 * time.Second} + var calendarData APICalendarResponse + + if err := callAndUnmarshalFromURL(&client, server.URL, 1, &calendarData); err != nil { + t.Fatalf("expected no error, got %v", err) + } + if got := len(calendarData.Events); got != 0 { + t.Fatalf("expected 0 events, got %d", got) + } + if got := calendarData.Page["total"]; got != 0 { + t.Fatalf("expected page.total=0, got %d", got) + } +} + func TestCallAndUnmarshalFromURL_Non200(t *testing.T) { t.Parallel() diff --git a/scrapers/testdata/cometCalendar/page-empty.json b/scrapers/testdata/cometCalendar/page-empty.json new file mode 100644 index 0000000..5422f9c --- /dev/null +++ b/scrapers/testdata/cometCalendar/page-empty.json @@ -0,0 +1,10 @@ +{ + "events": [], + "page": { + "total": 0 + }, + "date": { + "from": "2026-03-01", + "to": "2027-03-01" + } +} From c364939f39a552877700377f82b28337f790ad4c Mon Sep 17 00:00:00 2001 From: NewdlDewdl Date: Tue, 3 Mar 2026 22:22:40 -0600 Subject: [PATCH 4/4] docs: remove scraper testing section from README --- README.md | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/README.md b/README.md index 81df56b..0142f69 100644 --- 
a/README.md +++ b/README.md @@ -40,22 +40,6 @@ Additionally, you can run build (on Windows) and make (on MacOS/Linux) with the - `test`: Test run to see if the executable works after building - `build`: Builds the executble and makes it ready for use. -### Scraper Unit Testing Methodology - -Scrapers depend on external websites/APIs, so unit tests should avoid live network calls and instead replay representative samples. - -- Store captured payloads/HTML fixtures in `scrapers/testdata/`. -- Keep request parsing/normalization logic in helper functions that can be invoked directly from tests. -- For HTTP scrapers, use `httptest.Server` to serve fixture payloads and validate request query/header behavior. -- Keep end-to-end scraping against real sources as manual/integration checks, not unit tests. - -When choosing scraper samples, include at least: -- A representative success payload from production-like data. -- A sparse or empty payload (to verify no-data handling). -- A malformed payload or non-200 response (to verify error handling). - -This keeps tests deterministic, fast, and resilient to external website changes while still validating scraper logic. - ### Usage The `api-tools` command line interface supports three main modes: scraping, parsing and uploading data to the Nebula API.