python 在条形图中表示统计学显著性差异

omvjsjqw  于 2023-05-16  发布在  Python
关注(0)|答案(4)|浏览(195)

我用条形图来表示每组的数据。这些条中的一些彼此显著不同。如何在条形图中指出显著性差异?

import numpy as np
import matplotlib.pyplot as plt
# Group means and their standard deviations (drawn as error bars).
menMeans = (5, 15, 30, 40)
menStd = (2, 3, 4, 5)

# One x location per group, and the width of every bar.
ind = np.arange(len(menMeans))
width = 0.35

# Red bars with error bars taken from menStd.
p1 = plt.bar(ind, menMeans, width=width, color='r', yerr=menStd)

# Group labels; the ticks are shifted right by half a bar width.
tick_positions = ind + width / 2.
plt.xticks(tick_positions, ('A', 'B', 'C', 'D'))

我的目标是

azpvetkf

azpvetkf1#

上面的答案启发我自己写了一个小而灵活的函数:

def barplot_annotate_brackets(num1, num2, data, center, height, yerr=None, dh=.05, barh=.05, fs=None, maxasterix=None):
    """
    Annotate a barplot with a significance bracket and a p-value label.

    The bracket and its label are drawn on the current pyplot axes, above
    the taller of the two selected bars.

    :param num1: number of left bar to put bracket over
    :param num2: number of right bar to put bracket over
    :param data: string to write verbatim, or a number (p-value) that is
        converted to asterisks
    :param center: centers of all bars (like plt.bar() input)
    :param height: heights of all bars (like plt.bar() input)
    :param yerr: yerrs of all bars: a sequence indexed like ``height``
        (list, tuple or numpy array), a single scalar applied to every bar,
        or None
    :param dh: height offset over bar / bar + yerr, as a fraction of the
        current y-axis range (0 to 1)
    :param barh: bracket height, as a fraction of the current y-axis range
        (0 to 1)
    :param fs: font size
    :param maxasterix: maximum number of asterisks to write (for very small p-values)
    :raises ValueError: if ``data`` is a negative number
    """

    if isinstance(data, str):
        text = data
    else:
        # A negative value would stay below every threshold and, without
        # maxasterix, the loop below would never terminate.
        if data < 0:
            raise ValueError("p-value must be non-negative, got %r" % (data,))

        # * is p < 0.05
        # ** is p < 0.005
        # *** is p < 0.0005
        # etc.
        text = ''
        p = .05

        while data < p:
            text += '*'
            p /= 10.

            if maxasterix and len(text) == maxasterix:
                break

        if not text:
            text = 'n. s.'

    lx, ly = center[num1], height[num1]
    rx, ry = center[num2], height[num2]

    # Compare against None explicitly: the truth value of a multi-element
    # numpy array is ambiguous and raises.
    if yerr is not None:
        try:
            has_errs = len(yerr) > 0
        except TypeError:
            # Unsized value: treat it as one error shared by every bar.
            ly += yerr
            ry += yerr
        else:
            if has_errs:
                ly += yerr[num1]
                ry += yerr[num2]

    # Convert the fractional offsets into data units of the current axes.
    ax_y0, ax_y1 = plt.gca().get_ylim()
    dh *= (ax_y1 - ax_y0)
    barh *= (ax_y1 - ax_y0)

    y = max(ly, ry) + dh

    # Bracket corners: lower-left, upper-left, upper-right, lower-right.
    barx = [lx, lx, rx, rx]
    bary = [y, y+barh, y+barh, y]
    mid = ((lx+rx)/2, y+barh)

    plt.plot(barx, bary, c='black')

    kwargs = dict(ha='center', va='bottom')
    if fs is not None:
        kwargs['fontsize'] = fs

    plt.text(*mid, text, **kwargs)

这允许我得到一些相对简单的漂亮注解,例如:

# Three bars at x = 0, 1, 2.
heights = [1.8, 2, 3]
bars = np.arange(len(heights))

plt.figure()
plt.bar(bars, heights, align='center')
# Set the y-range before annotating: the bracket offsets are computed as
# fractions of the y-limits that are current at call time.
plt.ylim(0, 5)

# (left bar, right bar, p-value or literal label, extra keyword arguments)
annotations = [
    (0, 1, .1, {}),
    (1, 2, .001, {}),
    (0, 2, 'p < 0.0075', {'dh': .2}),
]
for left, right, label, extra in annotations:
    barplot_annotate_brackets(left, right, label, bars, heights, **extra)

ymzxtsji

ymzxtsji2#

我在这里做了几件在处理复杂图表时建议采用的事情。把自定义格式提取到字典中:当你想修改某个参数时会方便很多,而且可以把同一个字典传给多个图。我还写了一个自定义函数来为各组数值添加注解;额外的好处是,如果你确实需要,它也可以在 (A, C) 之间添加注解(不过我仍坚持我在评论中的看法:这不是合适的可视化方式)。数据变化后可能需要做一些微调,但这应该能让你走上正轨。

import numpy as np
import matplotlib.pyplot as plt
# Group means and standard deviations to plot.
menMeans = (5, 15, 30, 40)
menStd = (2, 3, 4, 5)
ind = np.arange(4)    # the x locations for the groups
width = 0.7
labels = ('A', 'B', 'C', 'D')

# Pull the formatting out here so it can be changed in one place and reused.
bar_kwargs = {'width': width, 'color': 'y', 'linewidth': 2, 'zorder': 5}
# 'fmt': 'none' draws only the error bars, without data markers or a
# connecting line. matplotlib >= 1.4 expects the string 'none'; the None
# object was the pre-1.4 spelling.
err_kwargs = {'zorder': 0, 'fmt': 'none', 'linewidth': 2, 'ecolor': 'k'}

fig, ax = plt.subplots()
# The returned artists are stored as attributes on the axes object.
ax.p1 = plt.bar(ind, menMeans, **bar_kwargs)
ax.errs = plt.errorbar(ind, menMeans, yerr=menStd, **err_kwargs)

# Custom function to draw the diff bars

def label_diff(i,j,text,X,Y):
    """Annotate the difference between bars ``i`` and ``j`` with ``text``.

    Draws a connector between the two bars and writes ``text`` above it.
    Drawing is done on the module-level ``ax`` axes object, which must
    exist before this function is called.

    :param i: index of the first bar
    :param j: index of the second bar
    :param text: label to write (e.g. a p-value string)
    :param X: x positions of all bars
    :param Y: heights of all bars
    """
    # Connector baseline: 10% above the taller of the two bars.
    y = 1.1*max(Y[i], Y[j])

    # 'bar' connection style with a head-less '-' arrow gives the bracket.
    props = {'connectionstyle':'bar','arrowstyle':'-',
             'shrinkA':20,'shrinkB':20,'linewidth':2}
    # The label is anchored at bar i, 7 data units above the baseline.
    ax.annotate(text, xy=(X[i],y+7), zorder=10)
    # Empty-text annotation: only its connector between the bars is drawn.
    ax.annotate('', xy=(X[i],y), xytext=(X[j],y), arrowprops=props)

# Annotate each pair of neighbouring bars: (first bar, second bar, label).
comparisons = (
    (0, 1, 'p=0.0370'),
    (1, 2, 'p<0.0001'),
    (2, 3, 'p=0.0025'),
)
for first, second, p_label in comparisons:
    label_diff(first, second, p_label, ind, menMeans)

# Raise the upper y-limit to leave room for the annotations, then label the
# groups and display the figure.
plt.ylim(ymax=60)
plt.xticks(ind, labels, color='k')
plt.show()

camsedfj

camsedfj3#

如果你正在使用matplotlib并寻求boxplot注解,请使用我的代码作为函数:

统计标注

def AnnoMe(x1, x2, ARRAY, TXT, offset=2, h=2, col='k'):
    """Draw a labelled bracket between two columns of a boxplot.

    The bracket is drawn on the current pyplot axes, starting ``offset``
    above the largest value found in the two compared columns.

    :param x1: 1-based position of the first column
    :param x2: 1-based position of the second column
    :param ARRAY: list of lists holding the data of every column
    :param TXT: text written above the bracket
    :param offset: vertical gap, in data units, between the largest value
        and the bottom of the bracket
    :param h: height of the bracket, in data units
    :param col: color of the bracket and of the text
    """
    # Positions are 1-based, so shift by one to index ARRAY.
    top = max(max(ARRAY[x1 - 1]), max(ARRAY[x2 - 1]))
    y = top + offset
    plt.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, c=col)
    plt.text((x1 + x2) * .5, y + h, TXT, ha='center', va='bottom', color=col)

其中'x1'和'x2'是您要比较的两列的位置(从 1 开始计数),'ARRAY'是您用来绘制箱线图的列表的列表(list of lists)。'TXT'是要显示的文本,例如字符串形式的 p 值,或"显著/不显著"。
因此,请使用以下命令调用它:

# Bracket between columns 1 and 2 (1-based positions); MyArray is the list of
# lists that holds the boxplot data.
AnnoMe(1, 2, MyArray, "p-value=0.02")
ecfdbz9o

ecfdbz9o4#

pandas dataframe的分组条形图

注解条形图之间的显著差异

我修改了 @cheresmate 的解决方案,使其也能接受 pandas DataFrame 作为输入。此函数已在 matplotlib 3.5.1 上测试。

def annotate_barplot_dataframe(bar0, bar1, text, patches, dh=0.2, barh=0.07):
    """Annotate a grouped barplot from a pandas dataframe.

    A bracket with ``text`` is drawn on the current pyplot axes from bar0 to
    bar1, where bars are numbered from left to right by their x position.

    Args:
        bar0 (int): index of first bar
        bar1 (int): index of second bar
        text (string): what to write on the annotation
        patches (iterable of matplotlib rectangle patches): the bars of the
            plot; the argument itself is left unmodified
        dh (float): vertical gap, in data units, between the top of the
            taller bar and the bottom of the bracket
        barh (float): height of the bracket, in data units
    """
    # Sort a copy so the caller's list keeps its order, and so any iterable
    # of patches is accepted.
    ordered = sorted(patches, key=lambda patch: patch.get_x())
    left = ordered[bar0]
    right = ordered[bar1]

    y = max(left.get_height(), right.get_height()) + dh

    # Horizontal centre of each bar: right edge minus half the width.
    l_mid = left.get_bbox().x1 - left.get_width() / 2
    r_mid = right.get_bbox().x1 - right.get_width() / 2

    # lower-left, upper-left, upper-right, lower-right
    barx = [l_mid, l_mid, r_mid, r_mid]
    bary = [y, y + barh, y + barh, y]
    plt.plot(barx, bary, c="black")

    # The text is anchored slightly above the bottom of the bracket.
    mid = ((l_mid + r_mid) / 2, y + 0.01)
    plt.text(*mid, text, ha="center", va="bottom")

def prepare_df(filename):
    """Load an Excel file and prepare it for the plot.

    Reads columns H:W of the first sheet, groups the rows by the
    "Columnkey" column and computes the column means of the groups whose
    key is 1, 2 and 3.

    Args:
        filename (string): must be a .xlsx file

    Returns:
        pandas.DataFrame: the group means side by side, in columns named
        "C1", "C2" and "C3"

    Raises:
        ValueError: if ``filename`` does not end in "xlsx", or if the file
            cannot be read (the original error is attached as the cause)
    """
    # Validate explicitly: an assert would be stripped when running with -O.
    if not filename.endswith("xlsx"):
        raise ValueError("Check file extension")

    try:
        df = pd.read_excel(filename, sheet_name=0, usecols="H:W", engine="openpyxl")
    except Exception as e:
        # Keep the original exception chained for easier debugging.
        raise ValueError(e) from e

    # Columnkey is the variable by which we want to group
    # e.g. in this example Columnkey's entries have 3 different values
    grouped = df.groupby(df["Columnkey"])

    return pd.concat(
        [grouped.get_group(key).mean().rename("C%d" % key) for key in (1, 2, 3)],
        axis=1,
    )

所以函数的输入应该是这样的。

if __name__ == "__main__":
    filename = "Data.xlsx"
    dataframe = prepare_df(filename)
    width = 0.7
    ax = dataframe.plot.bar(width=width, figsize=(9, 2))
    # The plot has one bar container per dataframe column (three here).
    # Gather their bars into a new list through the public `containers`
    # attribute, so no container's own patch list is modified.
    patches = [patch for container in ax.containers for patch in container.patches]
    annotate_barplot_dataframe(0, 1, "*", patches, 0.1)
    annotate_barplot_dataframe(1, 2, "*", patches, 0.1)

    # Save before showing the figure.
    plt.savefig(fname="filename.pdf", bbox_inches="tight")
    plt.show()

保存到磁盘的结果图片如下

相关问题