我正试图解决这个问题:访问 Spark DataFrame 中向量的元素(logistic 回归的概率向量),但要求在 PySpark 中不使用 UDF。我在 Scala 中看到了很多解决方案,但在 PySpark 中没有找到。
hgb9j2n61#
假设你的预测 DataFrame 如下:
predictions.show(false)
+-----------+--------------------------------------+------------+--------------------------------------+----------------------------------------------------------+-------------------------------------------------------------+----------+---------------+ |label |features |indexedLabel|indexedFeatures |rawPrediction |probability |prediction|predictedLabel | +-----------+--------------------------------------+------------+--------------------------------------+----------------------------------------------------------+-------------------------------------------------------------+----------+---------------+ |Iris-setosa|(123,[0,37,82,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[0,37,82,101],[1.0,1.0,1.0,1.0]) |[7.094347002635046,1.7433876811594202,1.1622653162055336] |[0.7094347002635046,0.174338768115942,0.11622653162055337] |0.0 |Iris-setosa | |Iris-setosa|(123,[0,39,58,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[0,39,58,101],[1.0,1.0,1.0,1.0]) |[7.867074275362319,1.2433876811594202,0.8895380434782609] |[0.7867074275362319,0.12433876811594202,0.0889538043478261] |0.0 |Iris-setosa | |Iris-setosa|(123,[0,39,62,107],[1.0,1.0,1.0,1.0]) |0.0 |(123,[0,39,62,107],[1.0,1.0,1.0,1.0]) |[5.159492704509035,2.794443583750028,2.046063711740936] |[0.5159492704509036,0.2794443583750028,0.2046063711740936] |0.0 |Iris-setosa | |Iris-setosa|(123,[2,39,58,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[2,39,58,101],[1.0,1.0,1.0,1.0]) |[7.822379507920459,1.2164981462756994,0.9611223458038423] |[0.7822379507920459,0.12164981462756994,0.09611223458038423] |0.0 |Iris-setosa | |Iris-setosa|(123,[2,43,62,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[2,43,62,101],[1.0,1.0,1.0,1.0]) |[7.049652235193186,1.7164981462756992,1.233849618531115] |[0.7049652235193186,0.17164981462756992,0.1233849618531115] |0.0 |Iris-setosa | |Iris-setosa|(123,[2,48,58,107],[1.0,1.0,1.0,1.0]) |0.0 |(123,[2,48,58,107],[1.0,1.0,1.0,1.0]) |[4.375677221716952,3.456351863073957,2.167970915209091] |[0.4375677221716952,0.3456351863073957,0.21679709152090912] |0.0 
|Iris-setosa | |Iris-setosa|(123,[4,43,71,108],[1.0,1.0,1.0,1.0]) |0.0 |(123,[4,43,71,108],[1.0,1.0,1.0,1.0]) |[3.521332027976265,3.4847888973511556,2.9938790746725785] |[0.3521332027976265,0.34847888973511554,0.29938790746725785] |0.0 |Iris-setosa | |Iris-setosa|(123,[4,54,58,107],[1.0,1.0,1.0,1.0]) |0.0 |(123,[4,54,58,107],[1.0,1.0,1.0,1.0]) |[3.6413022217169515,3.925101863073957,2.433595915209091] |[0.36413022217169516,0.3925101863073957,0.2433595915209091] |1.0 |Iris-versicolor| |Iris-setosa|(123,[10,38,58,110],[1.0,1.0,1.0,1.0])|0.0 |(123,[10,38,58,110],[1.0,1.0,1.0,1.0])|[4.537038655825478,3.3499080646243447,2.1130532795501766] |[0.4537038655825478,0.33499080646243445,0.21130532795501766] |0.0 |Iris-setosa | |Iris-setosa|(123,[12,48,58,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[12,48,58,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa | |Iris-setosa|(123,[12,52,71,107],[1.0,1.0,1.0,1.0])|0.0 |(123,[12,52,71,107],[1.0,1.0,1.0,1.0])|[2.657695664339902,4.252970715532974,3.0893336201271238] |[0.2657695664339902,0.4252970715532974,0.3089333620127124] |1.0 |Iris-versicolor| |Iris-setosa|(123,[13,38,62,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[13,38,62,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa | |Iris-setosa|(123,[15,39,59,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[15,39,59,101],[1.0,1.0,1.0,1.0])|[9.434782608695652,0.391304347826087,0.17391304347826086] |[0.9434782608695651,0.03913043478260869,0.017391304347826084]|0.0 |Iris-setosa | |Iris-setosa|(123,[17,37,59,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[17,37,59,101],[1.0,1.0,1.0,1.0])|[8.662055335968379,0.8913043478260869,0.44664031620553357]|[0.866205533596838,0.08913043478260871,0.044664031620553366] |0.0 |Iris-setosa | |Iris-setosa|(123,[17,39,59,108],[1.0,1.0,1.0,1.0])|0.0 
|(123,[17,39,59,108],[1.0,1.0,1.0,1.0])|[6.818110128751459,1.6741784322348767,1.5077114390136637] |[0.681811012875146,0.1674178432234877,0.1507711439013664] |0.0 |Iris-setosa | |Iris-setosa|(123,[20,35,63,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[20,35,63,101],[1.0,1.0,1.0,1.0])|[8.393939393939394,0.8833333333333333,0.7227272727272727] |[0.8393939393939395,0.08833333333333333,0.07227272727272727] |0.0 |Iris-setosa | |Iris-setosa|(123,[25,37,62,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[25,37,62,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa | |Iris-setosa|(123,[27,47,63,108],[1.0,1.0,1.0,1.0])|0.0 |(123,[27,47,63,108],[1.0,1.0,1.0,1.0])|[4.916336653323167,2.646676038886771,2.4369873077900626] |[0.4916336653323167,0.2646676038886771,0.24369873077900625] |0.0 |Iris-setosa | |Iris-setosa|(123,[29,48,58,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[29,48,58,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa | |Iris-setosa|(123,[33,35,96,110],[1.0,1.0,1.0,1.0])|0.0 |(123,[33,35,96,110],[1.0,1.0,1.0,1.0])|[2.6034320984484296,4.194443583750028,3.2021243178015424] |[0.26034320984484294,0.41944435837500277,0.32021243178015424]|1.0 |Iris-versicolor| +-----------+--------------------------------------+------------+--------------------------------------+----------------------------------------------------------+-------------------------------------------------------------+----------+---------------+ only showing top 20 rows
请执行下面的查询以获取概率向量中的元素。PS:还有其他编程方法可以获取概率向量中的元素,但这是我认为不使用 UDF 的方式。
predictions.selectExpr("*", "get_json_object(to_json(struct(probability)), '$.probability.values[0]') as fetch_probability").show(false)
predictions.selectExpr("*", "get_json_object(to_json(struct(probability)), '$.probability.values[0]') as fetch_probability").show(false) +-----------+--------------------------------------+------------+--------------------------------------+----------------------------------------------------------+-------------------------------------------------------------+----------+---------------+-------------------+ |label |features |indexedLabel|indexedFeatures |rawPrediction |probability |prediction|predictedLabel |fetch_probability | +-----------+--------------------------------------+------------+--------------------------------------+----------------------------------------------------------+-------------------------------------------------------------+----------+---------------+-------------------+ |Iris-setosa|(123,[0,37,82,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[0,37,82,101],[1.0,1.0,1.0,1.0]) |[7.094347002635046,1.7433876811594202,1.1622653162055336] |[0.7094347002635046,0.174338768115942,0.11622653162055337] |0.0 |Iris-setosa |0.7094347002635046 | |Iris-setosa|(123,[0,39,58,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[0,39,58,101],[1.0,1.0,1.0,1.0]) |[7.867074275362319,1.2433876811594202,0.8895380434782609] |[0.7867074275362319,0.12433876811594202,0.0889538043478261] |0.0 |Iris-setosa |0.7867074275362319 | |Iris-setosa|(123,[0,39,62,107],[1.0,1.0,1.0,1.0]) |0.0 |(123,[0,39,62,107],[1.0,1.0,1.0,1.0]) |[5.159492704509035,2.794443583750028,2.046063711740936] |[0.5159492704509036,0.2794443583750028,0.2046063711740936] |0.0 |Iris-setosa |0.5159492704509036 | |Iris-setosa|(123,[2,39,58,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[2,39,58,101],[1.0,1.0,1.0,1.0]) |[7.822379507920459,1.2164981462756994,0.9611223458038423] |[0.7822379507920459,0.12164981462756994,0.09611223458038423] |0.0 |Iris-setosa |0.7822379507920459 | |Iris-setosa|(123,[2,43,62,101],[1.0,1.0,1.0,1.0]) |0.0 |(123,[2,43,62,101],[1.0,1.0,1.0,1.0]) |[7.049652235193186,1.7164981462756992,1.233849618531115] 
|[0.7049652235193186,0.17164981462756992,0.1233849618531115] |0.0 |Iris-setosa |0.7049652235193186 | |Iris-setosa|(123,[2,48,58,107],[1.0,1.0,1.0,1.0]) |0.0 |(123,[2,48,58,107],[1.0,1.0,1.0,1.0]) |[4.375677221716952,3.456351863073957,2.167970915209091] |[0.4375677221716952,0.3456351863073957,0.21679709152090912] |0.0 |Iris-setosa |0.4375677221716952 | |Iris-setosa|(123,[4,43,71,108],[1.0,1.0,1.0,1.0]) |0.0 |(123,[4,43,71,108],[1.0,1.0,1.0,1.0]) |[3.521332027976265,3.4847888973511556,2.9938790746725785] |[0.3521332027976265,0.34847888973511554,0.29938790746725785] |0.0 |Iris-setosa |0.3521332027976265 | |Iris-setosa|(123,[4,54,58,107],[1.0,1.0,1.0,1.0]) |0.0 |(123,[4,54,58,107],[1.0,1.0,1.0,1.0]) |[3.6413022217169515,3.925101863073957,2.433595915209091] |[0.36413022217169516,0.3925101863073957,0.2433595915209091] |1.0 |Iris-versicolor|0.36413022217169516| |Iris-setosa|(123,[10,38,58,110],[1.0,1.0,1.0,1.0])|0.0 |(123,[10,38,58,110],[1.0,1.0,1.0,1.0])|[4.537038655825478,3.3499080646243447,2.1130532795501766] |[0.4537038655825478,0.33499080646243445,0.21130532795501766] |0.0 |Iris-setosa |0.4537038655825478 | |Iris-setosa|(123,[12,48,58,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[12,48,58,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa |0.6315277235193186 | |Iris-setosa|(123,[12,52,71,107],[1.0,1.0,1.0,1.0])|0.0 |(123,[12,52,71,107],[1.0,1.0,1.0,1.0])|[2.657695664339902,4.252970715532974,3.0893336201271238] |[0.2657695664339902,0.4252970715532974,0.3089333620127124] |1.0 |Iris-versicolor|0.2657695664339902 | |Iris-setosa|(123,[13,38,62,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[13,38,62,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa |0.6315277235193186 | |Iris-setosa|(123,[15,39,59,101],[1.0,1.0,1.0,1.0])|0.0 
|(123,[15,39,59,101],[1.0,1.0,1.0,1.0])|[9.434782608695652,0.391304347826087,0.17391304347826086] |[0.9434782608695651,0.03913043478260869,0.017391304347826084]|0.0 |Iris-setosa |0.9434782608695651 | |Iris-setosa|(123,[17,37,59,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[17,37,59,101],[1.0,1.0,1.0,1.0])|[8.662055335968379,0.8913043478260869,0.44664031620553357]|[0.866205533596838,0.08913043478260871,0.044664031620553366] |0.0 |Iris-setosa |0.866205533596838 | |Iris-setosa|(123,[17,39,59,108],[1.0,1.0,1.0,1.0])|0.0 |(123,[17,39,59,108],[1.0,1.0,1.0,1.0])|[6.818110128751459,1.6741784322348767,1.5077114390136637] |[0.681811012875146,0.1674178432234877,0.1507711439013664] |0.0 |Iris-setosa |0.681811012875146 | |Iris-setosa|(123,[20,35,63,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[20,35,63,101],[1.0,1.0,1.0,1.0])|[8.393939393939394,0.8833333333333333,0.7227272727272727] |[0.8393939393939395,0.08833333333333333,0.07227272727272727] |0.0 |Iris-setosa |0.8393939393939395 | |Iris-setosa|(123,[25,37,62,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[25,37,62,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa |0.6315277235193186 | |Iris-setosa|(123,[27,47,63,108],[1.0,1.0,1.0,1.0])|0.0 |(123,[27,47,63,108],[1.0,1.0,1.0,1.0])|[4.916336653323167,2.646676038886771,2.4369873077900626] |[0.4916336653323167,0.2646676038886771,0.24369873077900625] |0.0 |Iris-setosa |0.4916336653323167 | |Iris-setosa|(123,[29,48,58,101],[1.0,1.0,1.0,1.0])|0.0 |(123,[29,48,58,101],[1.0,1.0,1.0,1.0])|[6.315277235193186,2.185248146275699,1.499474618531115] |[0.6315277235193186,0.21852481462756992,0.1499474618531115] |0.0 |Iris-setosa |0.6315277235193186 | |Iris-setosa|(123,[33,35,96,110],[1.0,1.0,1.0,1.0])|0.0 |(123,[33,35,96,110],[1.0,1.0,1.0,1.0])|[2.6034320984484296,4.194443583750028,3.2021243178015424] |[0.26034320984484294,0.41944435837500277,0.32021243178015424]|1.0 |Iris-versicolor|0.26034320984484294| 
+-----------+--------------------------------------+------------+--------------------------------------+----------------------------------------------------------+-------------------------------------------------------------+----------+---------------+-------------------+ only showing top 20 rows
请注意下面用于访问概率向量元素的表达式;索引从 0 开始,可根据需要更改:
Accessing the first element of the probability vector:
get_json_object(to_json(struct(probability)), '$.probability.values[0]')
Accessing the second element of the probability vector:
get_json_object(to_json(struct(probability)), '$.probability.values[1]')
1条答案
按热度按时间hgb9j2n61#
假设你的预测 DataFrame 如下:
请执行下面的查询以获取概率向量中的元素。PS:还有其他编程方法可以获取概率向量中的元素,但这是我认为不使用 UDF 的方式。
请注意下面用于访问概率向量元素的表达式;索引从 0 开始,可根据需要更改: