From cb107af7f74737213e406f266292a374cab89a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 6 May 2026 15:16:18 -0500 Subject: [PATCH 1/5] chore: add contributing guidelines for bigframes.bigquery Add contributing guidelines for bigframes.bigquery APIs, detailing input and output policies, naming conventions, and examples for SQL operations. --- .../specs/bigframes-bigquery-contributing.md | 508 ++++++++++++++++++ 1 file changed, 508 insertions(+) create mode 100644 packages/bigframes/specs/bigframes-bigquery-contributing.md diff --git a/packages/bigframes/specs/bigframes-bigquery-contributing.md b/packages/bigframes/specs/bigframes-bigquery-contributing.md new file mode 100644 index 000000000000..188f9fb56710 --- /dev/null +++ b/packages/bigframes/specs/bigframes-bigquery-contributing.md @@ -0,0 +1,508 @@ +# bigframes.bigquery inputs and outputs policies + +The goal of the [bigframes.bigquery APIs](https://dataframes.bigquery.dev/reference/api/bigframes.bigquery.html#module-bigframes.bigquery) +is to provide the simplest possible mapping from BigQuery (GoogleSQL) +[functions](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/functions-all) and +[operations](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) to Python. "Simplest" is somewhat ambiguous +though, when it comes to the types involved an behaviors, so this document aims to expand on that vision with specific examples. + + +## SQL and BigFrames expression types + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SQL expression type(s) + Python type(s) + Notes + Examples +
Column expression (usable in a SELECT clause) + + + Both Python Series and column expression should be supported as inputs, with the output reflecting the users input. Use a TypeVar rather than directly using union types to make type checking easier. +

+Special considerations for Series inputs: +

+If an input and output are both a Series with the same number of rows, make sure the output Series is implicitly (row identity) alignable with the original input. In other words, don't generate a table expression. +

+If there are multiple Series inputs, they should be implicitly aligned if possible so as not to generate unnecessary table expressions. +

Most scalar functions accept one or more column expressions as input. +
Scalar values + + + Theoretically, we could try to get the type system to help the user disambiguate between this case and the "Column expression" case, but I think that's more trouble than it it's worth with regards to the expectations of Python users. + + +
Table expression + bpd.DataFrame +

+All columns are included as normal columns in the input table expression, including named index columns. If column names aren't unique or contain characters not compatible with BigQuery flexible column names, raise an error. +

+Outputs are unordered and unindexed to allow for cleaner mapping with SQL. +

Most APIs that take a table expression as input, also output a table expression with the same number of rows and passing through all unused columns. \ + \ +This should be used to pass through any index or ordering columns (as well as all other columns, if that's the SQL behavior), to allow for easy joining with the original input DataFrame. + Same number of rows as the input, so we should preserve index and ordering:
    + +
  • ML.PREDICT + +

    +Different number of rows in output, so no need to preserve index or ordering. Default index / ordering should be specified with the Session's configuration:

    +
+ +
Table name + string (referring to fully-qualified table ID, e.g. project.dataset.table / project.catalog.namespace.table) + Some SQL APIs do not support or have limitations with arbitrary table expressions, instead taking in a table ID, such as TABLESAMPLE expression. +

+Also, SEARCH and VECTOR_SEARCH, if you want the indexes attached to the table to actually apply. +

+For outputs, it might be preferable to output a table ID instead of a DataFrame, if the user is explicitly creating a table. For example, to_gbq() returns a string with the table name, which is useful for the case where BigFrames generates the table ID for the user. +

All of the items from the "Table expression" row above. APIs that require a table expression, but don't take a table ID can trivially take a table ID through a (SELECT * FROM table) subquery. +

+Some APIs only take a table ID and not an arbitrary table expression:

+ +
Aggregated table expression + DataFrameGroupBy + + + +
Analytic table expression +
    + +
  • DataFrameGroupBy - feasibility TBD +
  • Deferred column Expression with a Window applied.
+ +
+ + +
Column name (unqualified*) \ + \ +*I've only encountered examples where the table name / table expression is passed in separately. + string, +

+For cases where the column name is used as an alias and we aren't using named Series: +

+dict[str, Expression] +

Often a table expression input is paired with a column name input, as is the case with the CREATE MODEL and VECTOR_SEARCH APIs +

+If SQL expects a column name rather than a column expression, do not attempt to change this in Python. For example, don't allow a Series as a substitute for DataFrames + Column name. \ + \ +If the associated table expression is input as a DataFrame, validate that these map cleanly to SQL and raise a ValueError if not. For example: \ +

    + +
  • Duplicate column names (excluding unnamed index columns). +
  • Column names that are some hashable value other than integer (which maps cleanly to a column name) or string. +
  • Any column name containing a punctuation mark that is not allowed by BigQuery flexible column names, such as ! or $.
+ +
+ +
Literal values + corresponding literal Python value (e.g. int, float, string) + For cases where scalar values are also supported, it should be safe to start with this and then expand to support expressions without a breaking change, as is done in https://github.com/googleapis/google-cloud-python/pull/16606. + Most scalar functions accept one or more literal values as input. +
Scalar subqueries + Not supported yet, except implicitly in some aggregation use cases. +

+Would need some sort of bigframes deferred expression that can be tied to a table expression. +

+(Possibly DataFrame with 1 column?) +

+ +
+ + + +## Python policies + + +### Naming + +Take the SQL function name, keyword name (used as a function name in Python), or argument name and transform them to lower_snake_case to reflect Python conventions. + + +### Internal expressions + +Prefer creating deferred BigFrames expression objects where feasible. For example, all scalar outputting functions should return a `bigframes.pandas.Series` or `bigframes.core.col.Expression` that wraps a `bigframes.core.expression.Expression`. + +Prefer returning a `bigframes.pandas.DataFrame` that wraps a `bigframes.bigframes.core` + + +``` +.bigframe_node.BigFrameNode. See from_bq_data_source in bigframes.core +``` + + +`.array_value.ArrayValue`, as an example. + +Exceptions to this are cases where the output schema is likely to evolve or differ in ways that are difficult to model, such as the `ML.PREDICT` SQL function, where output columns differ based on the model type and support for model types are frequently added to BigQuery. In these exceptional cases, the generated query should run immediately and the returned value should wrap the results. + + +### Argument syntax details + +Arguments in Python can be one of: + + + +* Positional + * Supported by `*args` in Python, but not recommended. Positional arguments in SQL should map to named positional or keyword arguments in Python. +* Positional or keyword + * Required positional arguments should be positional, just like they are in SQL. +* Keyword-only + * All other arguments should be keyword-only. Use `, * ,` Python syntax to achieve this. + +For optional parameters, use an optional sentinel (see: https://stackoverflow.com/a/76606310/101923) and omit the value from the generated SQL if the user doesn't explicitly provide one. This ensures that an explicit NULL / None value can be passed in. 
\ + + + +``` + +from enum import Enum + +class Default(Enum): + token = 0 + +DEFAULT = Default.token + +def spam(*, ham: list[str] | None | Default = DEFAULT): + op_kwargs = {} + + if ham is not DEFAULT: + op_kwargs['ham'] = "prosciutto" + + ... + +``` + + + +### Scalar operations types policies + +Many operations output a table expression. For these, the output type is always a DataFrame, regardless of the the input types. + +For scalar operations, there are three cases to consider when determining the output types: + + + + + + + + + + + + + + + + + + + +
Scalar ops - Input type(s) + Scalar ops - Output type +
Expression + Expression +
Series / DataFrame + Series / DataFrame +

+Preserve ordering and index(es). Join inputs as needed before applying the operation. +

Mix of Expression and Series / DataFrame + Series / DataFrame +

+Preserve ordering and index(es). Join inputs as needed before applying the operation. +

+ + + +## Examples + + +### PIVOT SQL operator + +SQL syntax ([docs](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#pivot_operator)): + + +``` +FROM from_item[, ...] pivot_operator + +pivot_operator: + PIVOT( + aggregate_function_call [as_alias][, ...] + FOR input_column + IN ( pivot_column [as_alias][, ...] ) + ) [AS alias] + +as_alias: + [AS] alias + +``` + + +SQL example: + + +``` +WITH Produce AS ( + SELECT 'Kale' as product, 51 as sales, 'Q1' as quarter, 2020 as year UNION ALL + SELECT 'Kale', 23, 'Q2', 2020 UNION ALL + SELECT 'Kale', 45, 'Q3', 2020 UNION ALL + SELECT 'Kale', 3, 'Q4', 2020 UNION ALL + SELECT 'Kale', 70, 'Q1', 2021 UNION ALL + SELECT 'Kale', 85, 'Q2', 2021 UNION ALL + SELECT 'Apple', 77, 'Q1', 2020 UNION ALL + SELECT 'Apple', 0, 'Q2', 2020 UNION ALL + SELECT 'Apple', 1, 'Q1', 2021) +SELECT * FROM Produce + +/*---------+-------+---------+------+ + | product | sales | quarter | year | + +---------+-------+---------+------| + | Kale | 51 | Q1 | 2020 | + | Kale | 23 | Q2 | 2020 | + | Kale | 45 | Q3 | 2020 | + | Kale | 3 | Q4 | 2020 | + | Kale | 70 | Q1 | 2021 | + | Kale | 85 | Q2 | 2021 | + | Apple | 77 | Q1 | 2020 | + | Apple | 0 | Q2 | 2020 | + | Apple | 1 | Q1 | 2021 | + +---------+-------+---------+------*/ + + +SELECT * FROM + Produce + PIVOT(SUM(sales) FOR quarter IN ('Q1', 'Q2', 'Q3', 'Q4')) + +/*---------+------+----+------+------+------+ + | product | year | Q1 | Q2 | Q3 | Q4 | + +---------+------+----+------+------+------+ + | Apple | 2020 | 77 | 0 | NULL | NULL | + | Apple | 2021 | 1 | NULL | NULL | NULL | + | Kale | 2020 | 51 | 23 | 45 | 3 | + | Kale | 2021 | 70 | 85 | NULL | NULL | + +---------+------+----+------+------+------*/ + +``` + + +Python definition: + + +``` +def pivot( + table_expression: bpd.DataFrame, + *, + aggregation: Expression | dict[str, Expression], + input_column: str, + pivot_columns: dict[str, float | str | ...] | Sequence[float | str | ...], +) -> bpd.DataFrame: + ... 
+``` + + +Since pivot creates a table expression, we run immediately. + + \ +Python usage: + + +``` +pivotted = bbq.pivot( + my_produce_dataframe, + aggregation=bpd.col("sales").sum(), + input_column="quarter", + pivot_columns=["Q1", "Q2", "Q3", "Q4"], +) +``` + + + +### UNPIVOT SQL operator + +SQL syntax ([docs](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unpivot_operator)): + + +``` +FROM from_item[, ...] unpivot_operator + +unpivot_operator: + UNPIVOT [ { INCLUDE NULLS | EXCLUDE NULLS } ] ( + { single_column_unpivot | multi_column_unpivot } + ) [unpivot_alias] + +single_column_unpivot: + values_column + FOR name_column + IN (columns_to_unpivot) + +multi_column_unpivot: + values_column_set + FOR name_column + IN (column_sets_to_unpivot) + +values_column_set: + (values_column[, ...]) + +columns_to_unpivot: + unpivot_column [row_value_alias][, ...] + +column_sets_to_unpivot: + (unpivot_column [row_value_alias][, ...]) + +unpivot_alias and row_value_alias: + [AS] alias +``` + + +SQL example: + + +``` +WITH Produce AS ( + SELECT 'Kale' as product, 51 as Q1, 23 as Q2, 45 as Q3, 3 as Q4 UNION ALL + SELECT 'Apple', 77, 0, 25, 2) + +-- SELECT * FROM Produce +/*---------+----+----+----+----+ + | product | Q1 | Q2 | Q3 | Q4 | + +---------+----+----+----+----+ + | Kale | 51 | 23 | 45 | 3 | + | Apple | 77 | 0 | 25 | 2 | + +---------+----+----+----+----*/ + +SELECT * FROM Produce +UNPIVOT(sales FOR quarter IN (Q1, Q2, Q3, Q4)) -- single_column_unpivot + +/*---------+-------+---------+ + | product | sales | quarter | + +---------+-------+---------+ + | Kale | 51 | Q1 | + | Kale | 23 | Q2 | + | Kale | 45 | Q3 | + | Kale | 3 | Q4 | + | Apple | 77 | Q1 | + | Apple | 0 | Q2 | + | Apple | 25 | Q3 | + | Apple | 2 | Q4 | + +---------+-------+---------*/ +``` + + +Python definition: + + +``` +def unpivot( + table_expression: bpd.DataFrame, + *, + exclude_nulls: bool = True, + values_column: str | Sequence[str], + name_column: str, + 
columns_to_unpivot: dict[str, str | int] | Sequence[str], +) -> bpd.DataFrame: + ... +``` + + +Since unpivot creates a table expression, we run immediately. + + \ +Python usage: + + +``` +unpivotted = bbq.unpivot( + my_produce_dataframe, + aggregation=bpd.col("sales").sum(), + input_column="quarter", + pivot_columns=["Q1", "Q2", "Q3", "Q4"], +) +``` From 88bc4a77282341d0f933057cd9345f19058de34e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 6 May 2026 15:34:27 -0500 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../specs/bigframes-bigquery-contributing.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/bigframes/specs/bigframes-bigquery-contributing.md b/packages/bigframes/specs/bigframes-bigquery-contributing.md index 188f9fb56710..6453540b9826 100644 --- a/packages/bigframes/specs/bigframes-bigquery-contributing.md +++ b/packages/bigframes/specs/bigframes-bigquery-contributing.md @@ -4,7 +4,7 @@ The goal of the [bigframes.bigquery APIs](https://dataframes.bigquery.dev/refere is to provide the simplest possible mapping from BigQuery (GoogleSQL) [functions](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/functions-all) and [operations](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) to Python. "Simplest" is somewhat ambiguous -though, when it comes to the types involved an behaviors, so this document aims to expand on that vision with specific examples. +though, when it comes to the types involved and behaviors, so this document aims to expand on that vision with specific examples. ## SQL and BigFrames expression types @@ -237,7 +237,7 @@ Arguments in Python can be one of: * Keyword-only * All other arguments should be keyword-only. Use `, * ,` Python syntax to achieve this. 
-For optional parameters, use an optional sentinel (see: https://stackoverflow.com/a/76606310/101923) and omit the value from the generated SQL if the user doesn't explicitly provide one. This ensures that an explicit NULL / None value can be passed in. \ +For optional parameters, use an optional sentinel (see: https://stackoverflow.com/a/76606310/101923) and omit the value from the generated SQL if the user doesn't explicitly provide one. This ensures that an explicit NULL / None value can be passed in. @@ -264,7 +264,7 @@ def spam(*, ham: list[str] | None | Default = DEFAULT): ### Scalar operations types policies -Many operations output a table expression. For these, the output type is always a DataFrame, regardless of the the input types. +Many operations output a table expression. For these, the output type is always a DataFrame, regardless of the input types. For scalar operations, there are three cases to consider when determining the output types: @@ -501,8 +501,8 @@ Python usage: ``` unpivotted = bbq.unpivot( my_produce_dataframe, - aggregation=bpd.col("sales").sum(), - input_column="quarter", - pivot_columns=["Q1", "Q2", "Q3", "Q4"], + values_column="sales", + name_column="quarter", + columns_to_unpivot=["Q1", "Q2", "Q3", "Q4"], ) ``` From c1b42c981bec1f8ed869a089fe28d606d3897014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 6 May 2026 15:39:49 -0500 Subject: [PATCH 3/5] formatting --- .../specs/bigframes-bigquery-contributing.md | 126 ++++++++---------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/packages/bigframes/specs/bigframes-bigquery-contributing.md b/packages/bigframes/specs/bigframes-bigquery-contributing.md index 6453540b9826..b452e6c357f7 100644 --- a/packages/bigframes/specs/bigframes-bigquery-contributing.md +++ b/packages/bigframes/specs/bigframes-bigquery-contributing.md @@ -1,15 +1,17 @@ # bigframes.bigquery inputs and outputs policies -The goal of the [bigframes.bigquery 
APIs](https://dataframes.bigquery.dev/reference/api/bigframes.bigquery.html#module-bigframes.bigquery) +The goal of the [bigframes.bigquery +APIs](https://dataframes.bigquery.dev/reference/api/bigframes.bigquery.html#module-bigframes.bigquery) is to provide the simplest possible mapping from BigQuery (GoogleSQL) -[functions](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/functions-all) and -[operations](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) to Python. "Simplest" is somewhat ambiguous -though, when it comes to the types involved and behaviors, so this document aims to expand on that vision with specific examples. - +[functions](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/functions-all) +and +[operations](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) +to Python. "Simplest" is somewhat ambiguous though, when it comes to the types +involved and behaviors, so this document aims to expand on that vision with +specific examples. ## SQL and BigFrames expression types - - @@ -49,7 +57,10 @@ If there are multiple Series inputs, they should be implicitly aligned if possib
  • bigframes deferred expression
  • - - - @@ -197,49 +218,42 @@ Would need some sort of bigframes deferred expression that can be tied to a tabl
    SQL expression type(s) @@ -30,13 +32,19 @@ though, when it comes to the types involved and behaviors, so this document aims
  • bigframes deferred expression
  • Both Python Series and column expression should be supported as inputs, with the output reflecting the users input. Use a TypeVar rather than directly using union types to make type checking easier. + Both Python Series and column expression should be supported as inputs, + with the output reflecting the user's input. Use a TypeVar + rather than directly using union types to make type checking easier.

    Special considerations for Series inputs:

    -If an input and output are both a Series with the same number of rows, make sure the output Series is implicitly (row identity) alignable with the original input. In other words, don't generate a table expression. +If an input and output are both a Series with the same number of rows, make sure +the output Series is implicitly (row identity) alignable with the original +input. In other words, don't generate a table expression.

    -If there are multiple Series inputs, they should be implicitly aligned if possible so as not to generate unnecessary table expressions. +If there are multiple Series inputs, they should be implicitly aligned if +possible so as not to generate unnecessary table expressions.

    Most scalar functions accept one or more column expressions as input. Theoretically, we could try to get the type system to help the user disambiguate between this case and the "Column expression" case, but I think that's more trouble than it it's worth with regards to the expectations of Python users. + Theoretically, we could try to get the type system to help the user + disambiguate between this case and the "Column expression" case, but I think + that's more trouble than it's worth with regards to the expectations of + Python users.
      @@ -62,28 +73,38 @@ If there are multiple Series inputs, they should be implicitly aligned if possib
    bpd.DataFrame

    -All columns are included as normal columns in the input table expression, including named index columns. If column names aren't unique or contain characters not compatible with BigQuery flexible column names, raise an error. +All columns are included as normal columns in the input table expression, +including named index columns. If column names aren't unique or contain +characters not compatible with BigQuery flexible column names, raise an error.

    Outputs are unordered and unindexed to allow for cleaner mapping with SQL.

    Most APIs that take a table expression as input, also output a table expression with the same number of rows and passing through all unused columns. \ - \ -This should be used to pass through any index or ordering columns (as well as all other columns, if that's the SQL behavior), to allow for easy joining with the original input DataFrame. + Most APIs that take a table expression as input, also output a table + expression with the same number of rows and passing through all unused + columns. + +

    This should be used to pass through any index or ordering columns (as well + as all other columns, if that's the SQL behavior), to allow for easy joining + with the original input DataFrame.

    Same number of rows as the input, so we should preserve index and ordering:
    • ML.PREDICT

      -Different number of rows in output, so no need to preserve index or ordering. Default index / ordering should be specified with the Session's configuration:

        +Different number of rows in output, so no need to preserve index or ordering. +Default index / ordering should be specified with the Session's +configuration: +
        • CREATE MODEL
        • SEARCH
        • VECTOR_SEARCH

          -Possible to have the same number of rows as the input, but joining with the original goes against the purpose of the feature:

            +Possible to have the same number of rows as the input, but joining with the original goes against the purpose of the feature: +
        @@ -118,7 +139,7 @@ Some APIs only take a table ID and not an arbitrary table expression:
    @@ -174,7 +195,7 @@ If the associated table expression is input as a DataFrame, validate that these
    Literal values corresponding literal Python value (e.g. int, float, string) + corresponding literal Python value (e.g. int, float, string) For cases where scalar values are also supported, it should be safe to start with this and then expand to support expressions without a breaking change, as is done in https://github.com/googleapis/google-cloud-python/pull/16606.
    - - ## Python policies - ### Naming Take the SQL function name, keyword name (used as a function name in Python), or argument name and transform them to lower_snake_case to reflect Python conventions. - ### Internal expressions -Prefer creating deferred BigFrames expression objects where feasible. For example, all scalar outputting functions should return a `bigframes.pandas.Series` or `bigframes.core.col.Expression` that wraps a `bigframes.core.expression.Expression`. +Prefer creating deferred BigFrames expression objects where feasible. For +example, all scalar outputting functions should return a +`bigframes.pandas.Series` or `bigframes.core.col.Expression` that wraps a +`bigframes.core.expression.Expression`. -Prefer returning a `bigframes.pandas.DataFrame` that wraps a `bigframes.bigframes.core` - - -``` -.bigframe_node.BigFrameNode. See from_bq_data_source in bigframes.core -``` - - -`.array_value.ArrayValue`, as an example. - -Exceptions to this are cases where the output schema is likely to evolve or differ in ways that are difficult to model, such as the `ML.PREDICT` SQL function, where output columns differ based on the model type and support for model types are frequently added to BigQuery. In these exceptional cases, the generated query should run immediately and the returned value should wrap the results. +Prefer returning a `bigframes.pandas.DataFrame` that wraps a +`bigframes.core.bigframe_node.BigFrameNode`. See `from_bq_data_source` in +`bigframes.core.array_value.ArrayValue`, as an example. +Exceptions to this are cases where the output schema is likely to evolve or +differ in ways that are difficult to model, such as the `ML.PREDICT` SQL +function, where output columns differ based on the model type and support for +model types is frequently added to BigQuery. In these exceptional cases, the +generated query should run immediately and the returned value should wrap the +results. 
### Argument syntax details Arguments in Python can be one of: +* Positional + * Supported by `*args` in Python, but not recommended. Positional arguments in SQL should map to named positional or keyword arguments in Python. +* Positional or keyword + * Required positional arguments should be positional, just like they are in SQL. +* Keyword-only + * All other arguments should be keyword-only. Use `, * ,` Python syntax to achieve this. - -* Positional - * Supported by `*args` in Python, but not recommended. Positional arguments in SQL should map to named positional or keyword arguments in Python. -* Positional or keyword - * Required positional arguments should be positional, just like they are in SQL. -* Keyword-only - * All other arguments should be keyword-only. Use `, * ,` Python syntax to achieve this. - -For optional parameters, use an optional sentinel (see: https://stackoverflow.com/a/76606310/101923) and omit the value from the generated SQL if the user doesn't explicitly provide one. This ensures that an explicit NULL / None value can be passed in. - - +For optional parameters, use an optional sentinel (see: https://stackoverflow.com/a/76606310/101923) and omit the value from the generated SQL if the user doesn't explicitly provide one. This ensures that an explicit NULL / None value can be passed in. ``` @@ -260,15 +274,12 @@ def spam(*, ham: list[str] | None | Default = DEFAULT): ``` - - ### Scalar operations types policies Many operations output a table expression. For these, the output type is always a DataFrame, regardless of the input types. For scalar operations, there are three cases to consider when determining the output types: - 
    Scalar ops - Input type(s) @@ -300,16 +311,12 @@ Preserve ordering and index(es). Join inputs as needed before applying the opera
    - - ## Examples - ### PIVOT SQL operator SQL syntax ([docs](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#pivot_operator)): - ``` FROM from_item[, ...] pivot_operator @@ -325,10 +332,8 @@ as_alias: ``` - SQL example: - ``` WITH Produce AS ( SELECT 'Kale' as product, 51 as sales, 'Q1' as quarter, 2020 as year UNION ALL @@ -372,10 +377,8 @@ SELECT * FROM ``` - Python definition: - ``` def pivot( table_expression: bpd.DataFrame, @@ -387,13 +390,11 @@ def pivot( ... ``` - Since pivot creates a table expression, we run immediately. \ Python usage: - ``` pivotted = bbq.pivot( my_produce_dataframe, @@ -403,13 +404,10 @@ pivotted = bbq.pivot( ) ``` - - ### UNPIVOT SQL operator SQL syntax ([docs](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unpivot_operator)): - ``` FROM from_item[, ...] unpivot_operator @@ -441,10 +439,8 @@ unpivot_alias and row_value_alias: [AS] alias ``` - SQL example: - ``` WITH Produce AS ( SELECT 'Kale' as product, 51 as Q1, 23 as Q2, 45 as Q3, 3 as Q4 UNION ALL @@ -475,10 +471,8 @@ UNPIVOT(sales FOR quarter IN (Q1, Q2, Q3, Q4)) -- single_column_unpivot +---------+-------+---------*/ ``` - Python definition: - ``` def unpivot( table_expression: bpd.DataFrame, @@ -491,13 +485,11 @@ def unpivot( ... ``` - Since unpivot creates a table expression, we run immediately. 
\ Python usage: - ``` unpivotted = bbq.unpivot( my_produce_dataframe, From 42aadb51825d00e9b46bb48ee52a01d84507f798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 6 May 2026 15:42:07 -0500 Subject: [PATCH 4/5] list formatting --- .../specs/bigframes-bigquery-contributing.md | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/packages/bigframes/specs/bigframes-bigquery-contributing.md b/packages/bigframes/specs/bigframes-bigquery-contributing.md index b452e6c357f7..10931af0755f 100644 --- a/packages/bigframes/specs/bigframes-bigquery-contributing.md +++ b/packages/bigframes/specs/bigframes-bigquery-contributing.md @@ -82,32 +82,33 @@ Outputs are unordered and unindexed to allow for cleaner mapping with SQL. Most APIs that take a table expression as input, also output a table expression with the same number of rows and passing through all unused columns. - +

    This should be used to pass through any index or ordering columns (as well as all other columns, if that's the SQL behavior), to allow for easy joining with the original input DataFrame. - Same number of rows as the input, so we should preserve index and ordering:

      + Same number of rows as the input, so we should preserve index and ordering: -
    • ML.PREDICT + -

      -Different number of rows in output, so no need to preserve index or ordering. -Default index / ordering should be specified with the Session's -configuration: +

      + Different number of rows in output, so no need to preserve index or ordering. + Default index / ordering should be specified with the Session's + configuration: -

      -
    + From e4cc128ceaa263c2783d93e380fd6c13498fcba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 7 May 2026 19:42:43 +0000 Subject: [PATCH 5/5] fix prerelease tests --- packages/bigframes/noxfile.py | 4 ++-- .../tests/system/small/test_series.py | 24 ++++++------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/packages/bigframes/noxfile.py b/packages/bigframes/noxfile.py index 5dba688d3c4a..09364c4e6ff9 100644 --- a/packages/bigframes/noxfile.py +++ b/packages/bigframes/noxfile.py @@ -605,11 +605,11 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 "db-dtypes", # Ensure we catch breaking changes in the client libraries early. - "git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery", + "git+https://github.com/googleapis/google-cloud-python.git#egg=google-cloud-bigquery&subdirectory=packages/google-cloud-bigquery", "--upgrade", "-e", "git+https://github.com/googleapis/google-cloud-python.git#egg=google-cloud-bigquery-storage&subdirectory=packages/google-cloud-bigquery-storage", - "git+https://github.com/googleapis/python-bigquery-pandas.git#egg=pandas-gbq", + "git+https://github.com/googleapis/google-cloud-python.git#egg=pandas-gbq&subdirectory=packages/pandas-gbq", ) # Print out prerelease package versions. 
diff --git a/packages/bigframes/tests/system/small/test_series.py b/packages/bigframes/tests/system/small/test_series.py index c1ca6ebba55f..5df88e930432 100644 --- a/packages/bigframes/tests/system/small/test_series.py +++ b/packages/bigframes/tests/system/small/test_series.py @@ -33,6 +33,7 @@ import bigframes.pandas import bigframes.series as series import bigframes.testing +import bigframes.testing.utils from bigframes.testing.utils import ( assert_frame_equal, assert_series_equal, @@ -1232,23 +1233,12 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): scalars_pandas_df[col_y] ) # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. - if bf_div_result.dtype == pd.Int64Dtype(): - bigframes.testing.utils.assert_series_equal( - pd_div_result, bf_div_result.to_pandas(), check_dtype=False - ) - else: - bigframes.testing.utils.assert_series_equal( - pd_div_result, bf_div_result.astype("Float64").to_pandas() - ) - - if bf_mod_result.dtype == pd.Int64Dtype(): - bigframes.testing.utils.assert_series_equal( - pd_mod_result, bf_mod_result.to_pandas() - ) - else: - bigframes.testing.utils.assert_series_equal( - pd_mod_result, bf_mod_result.astype("Float64").to_pandas() - ) + bigframes.testing.utils.assert_series_equal( + pd_div_result, bf_div_result.to_pandas(), check_dtype=False + ) + bigframes.testing.utils.assert_series_equal( + pd_mod_result, bf_mod_result.to_pandas(), check_dtype=False + ) @pytest.mark.parametrize(