from pyspark.sql.functions import col

# Build an aliased column; its repr embeds the alias: Column<foo AS `bar`>
column = col('foo').alias('bar')
print(column)
#Column<foo AS `bar`>
import re
# Extract the alias from the repr with a lookbehind/lookahead pattern.
# BUG FIX: the original wrote `print(re.findall(...))[0]`, which subscripts
# print()'s return value (None) and raises TypeError — the [0] belongs on
# the findall() result. Also use a raw string for the regex.
print(re.findall(r"(?<=AS `)\w+(?=`>$)", str(column))[0])
#'bar'
# NOTE(review): the asserts below are a *specification* for a proposed wrapper
# around Column.alias/Column.name that would record the alias on an `AS`
# attribute — a plain pyspark Column has no `AS` attribute, and `explode`,
# `array`, `struct`, `lit` are not imported here, so this snippet is
# illustrative and will not run as-is.
assert(col("foo").alias("bar").AS == "bar")
# `name` should act like `alias`
assert(col("foo").name("bar").AS == "bar")
# column without alias should have None in `AS`
assert(col("foo").AS is None)
# multialias should be handled
assert(explode(array(struct(lit(1), lit("a")))).alias("foo", "bar").AS == ("foo", "bar"))
from pyspark.sql.functions import col

# Three columns: unaliased, aliased once, aliased twice.
col_1 = col('foo')
col_2 = col('foo').alias('bar')
col_3 = col('foo').alias('bar').alias('baz')

# For each column, print its repr and then the effective name parsed from
# that repr: drop backticks, take the text between the single quotes, and
# keep the last " AS " segment (the innermost alias wins).
for c in (col_1, col_2, col_3):
    s = str(c)
    print(c)
    print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo'>
# foo
# Column<'foo AS bar'>
# bar
# Column<'foo AS bar AS baz'>
# baz
from pyspark.sql import Column
def get_column_name(col: Column) -> str:
    """
    Return the effective name of an unbound Column, respecting aliases.

    PySpark doesn't allow you to directly access the column name with respect
    to aliases from an unbound column, so we parse it out of the string
    representation, e.g. Column<'foo AS bar'> -> 'bar'. This works on columns
    with one or more aliases as well as unaliased columns.

    Args:
        col: The (possibly aliased) Column to inspect.

    Returns:
        Col name as str, with respect to aliasing
    """
    # BUG FIX: str.lstrip/rstrip remove *any* run of characters drawn from the
    # argument, not a literal prefix/suffix — e.g. Column<'notes'> was mangled
    # to "tes" because the leading "no" is in the set "Column<'".
    # removeprefix/removesuffix (Python 3.9+) strip the exact substrings.
    c = str(col).removeprefix("Column<'").removesuffix("'>")
    return c.split(' AS ')[-1]
Some tests to verify the behavior:
import pytest
from pyspark.sql import SparkSession
from pyspark.sql import functions as f  # FIX: `f` was used below but never imported


@pytest.fixture(scope="session")
def spark() -> SparkSession:
    # Provide a session spark fixture for all tests
    yield SparkSession.builder.getOrCreate()


def test_get_col_name(spark):
    # Unaliased column resolves to its own name.
    col = f.col('a')
    actual = get_column_name(col)
    assert actual == 'a'


def test_get_col_name_alias(spark):
    # A single alias wins over the base name.
    col = f.col('a').alias('b')
    actual = get_column_name(col)
    assert actual == 'b'


def test_get_col_name_multiple_alias(spark):
    # The last alias in a chain is the effective name.
    col = f.col('a').alias('b').alias('c')
    actual = get_column_name(col)
    assert actual == 'c'
4 Answers
(sorted by votes / by time) — mwkjh3gx1#
One approach is via a regular expression:
yqyhoc1h2#
Alternatively, we can use a wrapper function that adjusts the behavior of the
`Column.alias` and `Column.name` methods so that the alias (only) is stored in
an `AS` attribute, thereby guaranteeing:
rmbxnbpk3#
I noticed that on some systems you may have backticks surrounding the column name.
Option 1 (no regex): `str(col).replace("`", "").split("'")[-2].split(" AS ")[-1]`
Option 2 (regex): the pattern `'.*?`?(\w+)`?'` looks safe enough:
`re.search(r"'.*?`?(\w+)`?'", str(col)).group(1)`
kdfy810k4#
For PySpark 3.x the backticks appear to have been replaced by single quotes, so
this may not work out of the box on earlier Spark versions, but it should be
easy to modify.
Some tests to verify the behavior: