Skip to content

Commit

Permalink
Add column functions to check for valid date strings
Browse files Browse the repository at this point in the history
  • Loading branch information
ghanse committed Jan 29, 2025
1 parent daf37e4 commit 9cecb50
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 0 deletions.
30 changes: 30 additions & 0 deletions src/databricks/labs/dqx/col_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,33 @@ def is_not_null_and_not_empty_array(col_name: str) -> Column:
column = F.col(col_name)
condition = column.isNull() | (F.size(column) == 0)
return make_condition(condition, f"Column {col_name} is null or empty array", f"{col_name}_is_null_or_empty_array")


def is_valid_date(col_name: str, date_format: str = "yyyy-MM-dd") -> Column:
    """
    Creates a condition column to check if a string is a valid date.

    Null values are not reported; only non-null values that cannot be parsed
    with the given format produce a message.

    :param col_name: column name to check
    :param date_format: date format pattern (e.g. 'yyyy-MM-dd'); 'MM' is month-of-year, 'mm' is minute-of-hour
    :return: Column object for condition
    """
    column = F.col(col_name)
    # to_date returns null for null input as well as for unparseable input, so nulls
    # are excluded explicitly; otherwise every missing value would be flagged as invalid.
    condition = column.isNotNull() & F.to_date(column, date_format).isNull()
    # The message is built as a Column expression. Interpolating `column` into an
    # f-string would embed the Column object's repr instead of the offending row value.
    # NOTE(review): relies on make_condition accepting a Column message — confirm.
    message = F.concat_ws(
        "",
        F.lit("Value '"),
        column.cast("string"),
        F.lit(f"' is not a valid date with format '{date_format}'"),
    )
    return make_condition(condition, message, f"{col_name}_is_not_valid_date")


def is_valid_timestamp(col_name: str, timestamp_format: str = "yyyy-MM-dd HH:mm:ss") -> Column:
    """
    Creates a condition column to check if a string is a valid timestamp.

    Null values are not reported; only non-null values that cannot be parsed
    with the given format produce a message.

    :param col_name: column name to check
    :param timestamp_format: timestamp format pattern (e.g. 'yyyy-MM-dd HH:mm:ss');
        'MM' is month-of-year, 'mm' is minute-of-hour
    :return: Column object for condition
    """
    column = F.col(col_name)
    # to_timestamp returns null for null input as well as for unparseable input, so nulls
    # are excluded explicitly; otherwise every missing value would be flagged as invalid.
    condition = column.isNotNull() & F.to_timestamp(column, timestamp_format).isNull()
    # The message is built as a Column expression. Interpolating `column` into an
    # f-string would embed the Column object's repr instead of the offending row value.
    # NOTE(review): relies on make_condition accepting a Column message — confirm.
    message = F.concat_ws(
        "",
        F.lit("Value '"),
        column.cast("string"),
        F.lit(f"' is not a valid timestamp with format '{timestamp_format}'"),
    )
    return make_condition(condition, message, f"{col_name}_is_not_valid_timestamp")
84 changes: 84 additions & 0 deletions tests/integration/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
    value_is_in_list,
    value_is_not_null_and_is_in_list,
    is_not_null_and_not_empty_array,
    is_valid_date,
    is_valid_timestamp,
)

SCHEMA = "a: string, b: int"
Expand Down Expand Up @@ -482,3 +483,86 @@ def test_col_is_not_null_and_not_empty_array(spark):
expected = spark.createDataFrame(checked_data, checked_schema)

assert_df_equality(actual, expected, ignore_nullable=True)


def test_col_is_valid_date(spark):
    """Verify is_valid_date reports unparseable strings and stays silent for valid dates and nulls."""
    input_rows = [
        ["2024-01-01", "12/31/2025", "invalid_date", None],
        ["12/31/2025", "2024-01-01", "invalid_date", None],
        ["12/31/2025", "invalid_date", "2024-01-01", None],
    ]
    source_df = spark.createDataFrame(input_rows, "a: string, b: string, c: string, d: string")

    # Columns a and d use the default format; b and c pass an explicit one.
    checks = [
        is_valid_date("a"),
        is_valid_date("b", "MM/dd/yyyy"),
        is_valid_date("c", "yyyy-MM-dd"),
        is_valid_date("d"),
    ]
    actual = source_df.select(*checks)

    expected_schema = """
a_is_not_valid_date: string,
b_is_not_valid_date: string,
c_is_not_valid_date: string,
d_is_not_valid_date: string
"""
    # One expected row per input row; None means "no violation reported".
    first_row = [None, None, "Value 'invalid_date' is not a valid date with format 'yyyy-MM-dd'", None]
    second_row = [
        "Value '12/31/2025' is not a valid date with format 'yyyy-MM-dd'",
        "Value '2024-01-01' is not a valid date with format 'MM/dd/yyyy'",
        "Value 'invalid_date' is not a valid date with format 'yyyy-MM-dd'",
        None,
    ]
    third_row = [
        "Value '12/31/2025' is not a valid date with format 'yyyy-MM-dd'",
        "Value 'invalid_date' is not a valid date with format 'MM/dd/yyyy'",
        None,
        None,
    ]
    expected = spark.createDataFrame([first_row, second_row, third_row], expected_schema)

    assert_df_equality(actual, expected, ignore_nullable=True)


def test_col_is_valid_timestamp(spark):
    """Verify is_valid_timestamp reports unparseable strings and stays silent for valid timestamps and nulls."""
    schema_array = "a: string, b: string, c: string, d: string"
    data = [
        ["2024-01-01 00:00:00", "12/31/2025 00:00:00", "invalid_timestamp", None],
        ["12/31/2025 00:00:00", "2024-01-01 00:00:00", "invalid_timestamp", None],
        ["12/31/2025 00:00:00", "invalid_timestamp", "2024-01-01 00:00:00", None],
    ]

    test_df = spark.createDataFrame(data, schema_array)

    # Exercise the timestamp check itself; calling is_valid_date here would produce
    # '*_is_not_valid_date' columns and date messages that can never match the expectation.
    actual = test_df.select(
        is_valid_timestamp("a"),
        is_valid_timestamp("b", "MM/dd/yyyy HH:mm:ss"),
        is_valid_timestamp("c", "yyyy-MM-dd HH:mm:ss"),
        is_valid_timestamp("d"),
    )

    checked_schema = """
        a_is_not_valid_timestamp: string,
        b_is_not_valid_timestamp: string,
        c_is_not_valid_timestamp: string,
        d_is_not_valid_timestamp: string
    """
    checked_data = [
        [None, None, "Value 'invalid_timestamp' is not a valid timestamp with format 'yyyy-MM-dd HH:mm:ss'", None],
        [
            "Value '12/31/2025 00:00:00' is not a valid timestamp with format 'yyyy-MM-dd HH:mm:ss'",
            "Value '2024-01-01 00:00:00' is not a valid timestamp with format 'MM/dd/yyyy HH:mm:ss'",
            "Value 'invalid_timestamp' is not a valid timestamp with format 'yyyy-MM-dd HH:mm:ss'",
            None,
        ],
        [
            "Value '12/31/2025 00:00:00' is not a valid timestamp with format 'yyyy-MM-dd HH:mm:ss'",
            # Column b is checked with the MM/dd/yyyy format, so the message must cite that format.
            "Value 'invalid_timestamp' is not a valid timestamp with format 'MM/dd/yyyy HH:mm:ss'",
            None,
            None,
        ],
    ]
    expected = spark.createDataFrame(checked_data, checked_schema)

    assert_df_equality(actual, expected, ignore_nullable=True)

0 comments on commit 9cecb50

Please sign in to comment.