From fbab774a5ecb57edc3a15ee7d296d169bff35e64 Mon Sep 17 00:00:00 2001 From: "ram.yamasani@reachlocal.com" Date: Fri, 22 May 2026 10:20:23 -0700 Subject: [PATCH 1/3] DBE-3763 Fix for known issues --- data_diff/databases/mysql.py | 3 +++ data_diff/utils.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py index 1ee04460b..72665deee 100644 --- a/data_diff/databases/mysql.py +++ b/data_diff/databases/mysql.py @@ -8,6 +8,7 @@ Float, Decimal, Integer, + JSON, Text, TemporalType, FractionalType, @@ -68,6 +69,8 @@ class Dialect(BaseDialect): "tinytext": Text, # Boolean "boolean": Boolean, + # JSON + "json": JSON, } def quote(self, s: str) -> str: diff --git a/data_diff/utils.py b/data_diff/utils.py index 1d1405fdb..5c0ab2d0b 100644 --- a/data_diff/utils.py +++ b/data_diff/utils.py @@ -513,6 +513,13 @@ def diff_int_dynamic_color_template(diff_value: int) -> str: def _jsons_equiv(a: str, b: str): + # Treat Python None (DB null) as the JSON null literal so that a NULL on + # the MySQL side matches a 'null' string produced by TO_JSON_STRING(NULL) + # on the BigQuery side (or any other DB that serializes NULL as 'null'). + if a is None: + a = "null" + if b is None: + b = "null" try: return json.loads(a) == json.loads(b) except (ValueError, TypeError, json.decoder.JSONDecodeError): # not valid jsons From 15a6f81dbfec63201e02cbe10007413691744bbb Mon Sep 17 00:00:00 2001 From: "ram.yamasani@reachlocal.com" Date: Tue, 26 May 2026 13:59:19 -0700 Subject: [PATCH 2/3] Fix for '0000-00-00 00:00:00' mysql timestamp value --- data_diff/databases/mysql.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py index 72665deee..b3e7e8054 100644 --- a/data_diff/databases/mysql.py +++ b/data_diff/databases/mysql.py @@ -109,6 +109,15 @@ def md5_as_hex(self, s: str) -> str: return f"md5({s})" def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: + # MySQL zero-date equivalences vs BigQuery: + # TIMESTAMP '0000-00-00 00:00:00' -> '1970-01-01 00:00:00.000000' (Unix epoch) + # DATETIME '0000-00-00 00:00:00' -> NULL + if isinstance(coltype, Timestamp): + epoch = "cast('1970-01-01 00:00:00' as datetime(6))" + value = f"IF({value} = '0000-00-00 00:00:00', {epoch}, {value})" + elif isinstance(coltype, Datetime): + value = f"NULLIF({value}, '0000-00-00 00:00:00')" + if coltype.rounds: return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))") From 10e19a2a52fe9e1ef3c7a5a47929d50e3219d23e Mon Sep 17 00:00:00 2001 From: Ram Yamasani Date: Tue, 26 May 2026 14:14:39 -0700 Subject: [PATCH 3/3] Update mysql.py epoch timestamp value testing --- data_diff/databases/mysql.py | 9 -------- data_diff/utils.py | 7 ++++-- pyproject.toml | 4 ++-- tests/test_utils.py | 41 ++++++++++++++++++++++++++++++++++++ uv.lock | 4 ++-- 5 files changed, 50 insertions(+), 15 deletions(-) diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py index b3e7e8054..72665deee 100644 --- a/data_diff/databases/mysql.py +++ b/data_diff/databases/mysql.py @@ -109,15 +109,6 @@ def md5_as_hex(self, s: str) -> str: return f"md5({s})" def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: - # MySQL zero-date equivalences vs BigQuery: - # TIMESTAMP '0000-00-00 00:00:00' -> '1970-01-01 00:00:00.000000' (Unix epoch) - # DATETIME '0000-00-00 00:00:00' -> NULL - if isinstance(coltype, Timestamp): - epoch = "cast('1970-01-01 00:00:00' as datetime(6))" - value = f"IF({value} = '0000-00-00 00:00:00', {epoch}, {value})" - elif isinstance(coltype, Datetime): - value = f"NULLIF({value}, '0000-00-00 00:00:00')" - if coltype.rounds: return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))") diff --git a/data_diff/utils.py b/data_diff/utils.py index 5c0ab2d0b..1ad9fbac2 100644 --- a/data_diff/utils.py +++ b/data_diff/utils.py @@ -512,7 +512,7 @@ def diff_int_dynamic_color_template(diff_value: int) -> str: return "0" -def _jsons_equiv(a: str, b: str): +def _jsons_equiv(a: Optional[str], b: Optional[str]): # Treat Python None (DB null) as the JSON null literal so that a NULL on # the MySQL side matches a 'null' string produced by TO_JSON_STRING(NULL) # on the BigQuery side (or any other DB that serializes NULL as 'null'). @@ -520,9 +520,12 @@ def _jsons_equiv(a: str, b: str): a = "null" if b is None: b = "null" + # Fast-path: identical strings don't need JSON parsing. + if a == b: + return True try: return json.loads(a) == json.loads(b) - except (ValueError, TypeError, json.decoder.JSONDecodeError): # not valid jsons + except (ValueError, TypeError): # covers json.JSONDecodeError (subclass of ValueError) return False diff --git a/pyproject.toml b/pyproject.toml index dd2b05237..cbc3aa75e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ [project.optional-dependencies] preql = ["preql>=0.2.19"] -mysql = ["mysql-connector-python==8.0.29"] +mysql = ["mysql-connector-python>=8.0.29"] postgresql = ["psycopg2"] redshift = ["psycopg2"] snowflake = ["snowflake-connector-python>=3.0.2,<4.0.0", "cryptography"] @@ -54,7 +54,7 @@ duckdb = ["duckdb"] bigquery = ["google-cloud-bigquery"] all-dbs = [ "preql>=0.2.19", - "mysql-connector-python==8.0.29", + "mysql-connector-python>=8.0.29", "psycopg2", "snowflake-connector-python>=3.0.2,<4.0.0", "cryptography", diff --git a/tests/test_utils.py b/tests/test_utils.py index 712f3467c..613c0b032 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,6 +11,7 @@ columns_removed_template, columns_added_template, columns_type_changed_template, + _jsons_equiv, ) from data_diff.__main__ import _remove_passwords_in_dict @@ -211,3 +212,43 @@ def test_columns_type_changed_template(self): output = columns_type_changed_template({"column1", "column2"}) self.assertIn("Type changed [2]: [green]", output) self.assertEqual(self.extract_columns_set(output), {"column1", "column2"}) + + +class TestJsonsEquiv(unittest.TestCase): + # --- None / null equivalence --- + def test_both_none(self): + """Two DB NULLs are equivalent.""" + self.assertTrue(_jsons_equiv(None, None)) + + def test_none_vs_json_null_string(self): + """DB NULL on one side, JSON 'null' string on the other, are equivalent.""" + self.assertTrue(_jsons_equiv(None, "null")) + self.assertTrue(_jsons_equiv("null", None)) + + def test_none_vs_json_string_null(self): + """DB NULL must NOT equal the JSON string literal \"null\".""" + self.assertFalse(_jsons_equiv(None, '"null"')) + self.assertFalse(_jsons_equiv('"null"', None)) + + # --- Identical strings fast-path --- + def test_identical_strings(self): + self.assertTrue(_jsons_equiv('{"a": 1}', '{"a": 1}')) + + # --- Semantic JSON equivalence --- + def test_equivalent_objects_different_whitespace(self): + self.assertTrue(_jsons_equiv('{"a":1,"b":2}', '{"b": 2, "a": 1}')) + + def test_equivalent_arrays(self): + self.assertTrue(_jsons_equiv("[1, 2, 3]", "[1,2,3]")) + + def test_different_values(self): + self.assertFalse(_jsons_equiv('{"a": 1}', '{"a": 2}')) + + def test_different_types(self): + self.assertFalse(_jsons_equiv("1", '"1"')) + + # --- Invalid JSON --- + def test_invalid_json_returns_false(self): + # Different invalid-JSON strings → False (can't parse either side) + self.assertFalse(_jsons_equiv("not-json", "also-not-json")) + self.assertFalse(_jsons_equiv('{"a": 1}', "not-json")) diff --git a/uv.lock b/uv.lock index eff8aea1a..f367694db 100644 --- a/uv.lock +++ b/uv.lock @@ -949,8 +949,8 @@ requires-dist = [ { name = "google-cloud-bigquery", marker = "extra == 'bigquery'" }, { name = "keyring" }, { name = "mashumaro", extras = ["msgpack"], specifier = ">=2.9,<3.11.0" }, - { name = "mysql-connector-python", marker = "extra == 'all-dbs'", specifier = "==8.0.29" }, - { name = "mysql-connector-python", marker = "extra == 'mysql'", specifier = "==8.0.29" }, + { name = "mysql-connector-python", marker = "extra == 'all-dbs'", specifier = ">=8.0.29" }, + { name = "mysql-connector-python", marker = "extra == 'mysql'", specifier = ">=8.0.29" }, { name = "oracledb", marker = "extra == 'all-dbs'" }, { name = "oracledb", marker = "extra == 'oracle'" }, { name = "preql", marker = "extra == 'all-dbs'", specifier = ">=0.2.19" },