csv Pandas:如何把多个CSV文件中的数据加载到一个pandas DataFrame中?

biswetbf  于 12个月前  发布在  其他
关注(0)|答案(4)|浏览(92)

大家好,我是Pandas的新手
我有多个CSV文件,如下所示:

john_age.csv
john_gender.csv
john_weight.csv
mike_age.csv
mike_gender.csv
mike_weight.csv
smith_age.csv
smith_gender.csv
smith_weight.csv
...
...

每个csv文件都有一个简单的字符串或数字,如下所示:

john_age.csv       54
john_gender.csv    male
john_weight.csv    65.4

基本上,我想让整个dataframe看起来像这样:

         age    gender    weight
john     54     male      65.4
mike     23     male      86.5
smith    52     female    54

我怎样才能做到这一点?
我认为关键的想法是将每个csv文件名合并到dataframe中,但到目前为止,我只能使用glob.glob和append函数读取多个csv文件,但append函数不是解决方案:

import glob
import os

import pandas as pd

# Directory holding the per-person CSV files. A raw string keeps the
# backslash literal instead of starting an escape sequence.
csv_path = r'\mypath'

# os.path.join inserts the right separator for the current platform.
filenames = glob.glob(os.path.join(csv_path, '*.csv'))

# Each file holds one bare value, so header=None stops pandas from
# consuming that value as a column header.
dfs = [pd.read_csv(file, header=None) for file in filenames]

提前感谢!!

r1wp621o

r1wp621o1#

这将根据这些文件创建一个DataFrame。

import glob
import os

import pandas as pd

csv_path = 'csvs'

AGE_SUFFIX = '_age.csv'


def read_person(age_file, attributes):
    """Build one record from a person's ``<name>_<attr>.csv`` files.

    Args:
        age_file: Path to the person's ``<name>_age.csv`` file; the name and
            the directory of the sibling files are derived from it.
        attributes: Attribute names; ``<name>_<attr>.csv`` is read for each.

    Returns:
        A dict with a ``'name'`` key plus one key per attribute, each holding
        the first line of the matching file with surrounding whitespace
        removed.
    """
    directory, base = os.path.split(age_file)
    # Strip the known suffix rather than slicing by a fixed offset, so the
    # result does not depend on how long the directory name is.
    name = base[:-len(AGE_SUFFIX)]
    record = {'name': name}
    for attr in attributes:
        attr_file = os.path.join(directory, f'{name}_{attr}.csv')
        with open(attr_file, 'r') as data_file:
            record[attr] = data_file.readline().strip()
    return record


# One "*_age.csv" file exists per person, so it serves as the list of people.
filenames = sorted(glob.glob(os.path.join(csv_path, '*' + AGE_SUFFIX)))

attrs = ['age', 'gender', 'weight']
people = [read_person(file, attrs) for file in filenames]

# Explicit columns keep the frame well-formed even when no files are found.
df = pd.DataFrame(people, columns=['name', *attrs])

print(df)
2skhul33

2skhul332#

我的思路大致如下:

import csv
import glob
import os

import pandas as pd

# Attributes stored per person, one "<name>_<attr>.csv" file each.
parts = ('age', 'gender', 'weight')

# Merge the single-value files into one CSV with a row per person.
# NOTE: csv_path is expected to be defined by the caller (the directory
# holding the per-person files).
with open('combined.csv', 'w', newline='') as combine:
    writer = csv.writer(combine)
    # Header row, so read_csv does not consume the first person as a header.
    writer.writerow(('name',) + parts)
    for fn in sorted(glob.glob(os.path.join(csv_path, '*_age.csv'))):
        name = os.path.basename(fn)[:-len('_age.csv')]
        fields = [name]
        for part in parts:
            part_path = os.path.join(csv_path, f'{name}_{part}.csv')
            # Context manager closes every handle promptly.
            with open(part_path) as part_file:
                fields.append(part_file.read().strip())
        writer.writerow(fields)

# Person names become the row index, matching the layout asked for.
dfs = pd.read_csv('combined.csv', index_col='name')
0vvn1miw

0vvn1miw3#

您可以使用pd.concat()在一行中完成此操作

import os
from glob import glob

import pandas as pd


def load_people(paths, columns=("age", "gender", "weight")):
    """Assemble a person-by-attribute frame from ``<name>_<attr>.csv`` files.

    Args:
        paths: Iterable of CSV paths, each holding a single value. The text
            before the last underscore of the file stem is the person's
            name, the text after it the attribute.
        columns: Attribute columns to keep, in output order.

    Returns:
        A DataFrame indexed by person name with one column per attribute.
        Files whose stem has no underscore are skipped. With no usable
        files, an empty frame with the requested columns is returned.
    """
    cells = []
    for path in paths:
        stem = os.path.splitext(os.path.basename(path))[0]
        name, sep, attr = stem.rpartition("_")
        if not sep:
            continue
        # header=None: the file's only line is data, not a header.
        value = pd.read_csv(path, header=None).iat[0, 0]
        cells.append(pd.DataFrame({attr: [value]}, index=[name]))
    if not cells:
        # pd.concat raises on an empty list, so handle it explicitly.
        return pd.DataFrame(columns=list(columns))
    # Concatenating the one-cell frames leaves one sparse row per file;
    # first() collapses them into one row per person, skipping the NaNs.
    merged = pd.concat(cells).groupby(level=0).first()
    return merged.reindex(columns=list(columns))


files = glob("path/to/files/*.csv")
files.sort()

data = load_people(files)
xtupzzrd

xtupzzrd4#

import warnings
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Exploratory analysis of the used-cars dataset. Written notebook-style:
# bare expressions such as data.head() only display output when run in a
# notebook cell.

# to ignore warnings
warnings.filterwarnings('ignore')

data = pd.read_csv("used_cars.csv")

data.head()

data.tail()

data.info()

data.nunique()

data.isnull().sum()

# Percentage of missing values per column.
(data.isnull().sum() / (len(data))) * 100

# Remove S.No. column from data
data = data.drop(['S.No.'], axis=1)
data.info()

date.today().year
data['Car_Age'] = date.today().year - data['Year']
data.head()

# Brand is the first word of Name; Model joins the second and third words.
name_words = data.Name.str.split()
data['Brand'] = name_words.str.get(0)
data['Model'] = name_words.str.get(1) + name_words.str.get(2)
data[['Name', 'Brand', 'Model']]

print(data.Brand.unique())
print(data.Brand.nunique())

searchfor = ['Isuzu', 'ISUZU', 'Mini', 'Land']
data[data.Brand.str.contains('|'.join(searchfor))].head(5)

# Assign the result back: an in-place replace on a selected column is a
# chained assignment and may not update `data`.
data["Brand"] = data["Brand"].replace(
    {"ISUZU": "Isuzu", "Mini": "Mini Cooper", "Land": "Land Rover"}
)

data.describe().T

data.describe(include='all').T

cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=np.number).columns.tolist()
print("Categorical Variables:")
print(cat_cols)
print("Numerical Variables:")
print(num_cols)

# Histogram and box plot side by side for every numeric column.
for col in num_cols:
    print(col)
    print('Skew :', round(data[col].skew(), 2))
    plt.figure(figsize=(15, 4))
    plt.subplot(1, 2, 1)
    data[col].hist(grid=False)
    plt.ylabel('count')
    plt.subplot(1, 2, 2)
    sns.boxplot(x=data[col])
    plt.show()

fig, axes = plt.subplots(3, 2, figsize=(18, 18))
fig.suptitle('Bar plot for all categorical variables in the dataset')
# (axis, column, category order) for each count plot. Brand and Model only
# order by the categories seen in the first 20 rows.
count_specs = [
    (axes[0, 0], 'Fuel_Type', data['Fuel_Type'].value_counts().index),
    (axes[0, 1], 'Transmission', data['Transmission'].value_counts().index),
    (axes[1, 0], 'Owner_Type', data['Owner_Type'].value_counts().index),
    (axes[1, 1], 'Location', data['Location'].value_counts().index),
    (axes[2, 0], 'Brand', data['Brand'].head(20).value_counts().index),
    (axes[2, 1], 'Model', data['Model'].head(20).value_counts().index),
]
for axis, column, order in count_specs:
    sns.countplot(ax=axis, x=column, data=data, color='blue', order=order)
axes[1][1].tick_params(labelrotation=45)
axes[2][0].tick_params(labelrotation=90)
axes[2][1].tick_params(labelrotation=90)


# Function for log transformation of the column
def log_transform(data, col):
    """Add a ``<column>_log`` column for each column name in *col*.

    Uses ``log(x + 1)`` when every value of the column equals 1.0 and a
    plain natural log otherwise, then prints ``data.info()``.
    """
    # NOTE(review): the all-ones test looks like it was meant to guard
    # against zeros (log(0) = -inf) -- confirm the intended condition.
    for colname in col:
        if (data[colname] == 1.0).all():
            data[colname + '_log'] = np.log(data[colname] + 1)
        else:
            data[colname + '_log'] = np.log(data[colname])
    data.info()


log_transform(data, ['Kilometers_Driven', 'Price'])
# Log transformation of the feature 'Kilometers_Driven'
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(data["Kilometers_Driven_log"], axlabel="Kilometers_Driven_log")


plt.figure(figsize=(13, 17))
sns.pairplot(data=data.drop(['Kilometers_Driven', 'Price'], axis=1))
plt.show()


fig, axarr = plt.subplots(4, 2, figsize=(12, 18))
# (grouping column, how many top groups to show or None for all), listed
# in row-major subplot order.
price_specs = [
    ('Location', None),
    ('Transmission', None),
    ('Fuel_Type', None),
    ('Owner_Type', None),
    ('Brand', 10),
    ('Model', 10),
    ('Seats', None),
    ('Car_Age', None),
]
for axis, (column, top_n) in zip(axarr.flat, price_specs):
    mean_price = (
        data.groupby(column)['Price_log'].mean().sort_values(ascending=False)
    )
    if top_n is not None:
        mean_price = mean_price.head(top_n)
    mean_price.plot.bar(ax=axis, fontsize=12)
    axis.set_title(f"{column} Vs Price", fontsize=18)
plt.subplots_adjust(hspace=1.0)
plt.subplots_adjust(wspace=.5)
sns.despine()


plt.figure(figsize=(12, 7))
# Restrict to numeric columns: recent pandas raises when corr() meets
# text columns.
numeric_data = data.drop(['Kilometers_Driven', 'Price'], axis=1).select_dtypes(
    include=np.number
)
sns.heatmap(numeric_data.corr(), annot=True, vmin=-1, vmax=1)
plt.show()

# Treat a mileage of 0.0 as missing, then impute with the column mean.
data.loc[data["Mileage"] == 0.0, 'Mileage'] = np.nan
data.Mileage.isnull().sum()
data['Mileage'] = data['Mileage'].fillna(data['Mileage'].mean())

data.Seats.isnull().sum()
# Fill missing values with the median of the same brand/model group.
# transform() returns a result aligned to the original row index, whereas
# groupby.apply can prepend the group keys and break the assignment.
data['Seats'] = data.groupby(['Model', 'Brand'])['Seats'].transform(
    lambda x: x.fillna(x.median())
)
data['Engine'] = data.groupby(['Brand', 'Model'])['Engine'].transform(
    lambda x: x.fillna(x.median())
)
data['Power'] = data.groupby(['Brand', 'Model'])['Power'].transform(
    lambda x: x.fillna(x.median())
)
        
        #Athlete dataset:
# Stack the four partial tables into a single frame with a fresh index.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
# Converting all columns to objects
for i in list(df.columns):
    # Strip function removes unwanted empty spaces
    df[i] = df[i].astype(str).str.strip()

# NOTE(review): the string conversion above turns real NaN values into the
# text 'nan', which this replacement does not map back -- confirm whether
# the inputs can contain NaN.
df = df.replace("Null", np.nan)
df.isna().sum()

# Ten teams with the most gold-medal rows.
# NOTE: this rebinds df2, which was one of the concat inputs above.
collist = ['Team', 'Medal', 'ID']
df2 = df[collist].groupby(["Team", "Medal"]).count()
df2 = df2.reset_index()
df2 = df2[df2["Medal"] == "Gold"].sort_values(by="ID", ascending=False)[:10]
df2

# Filter out rows with missing medals
medalists = df[df['Medal'].notna()]

# Group by country and count the number of medals
country_medals = medalists['Team'].value_counts()

# Get the top 10 countries with the highest number of medals
top_countries = country_medals.head(10)

# Display the result
print(top_countries)

# NOTE(review): every column was converted to text above, so max/min here
# compare strings rather than numbers -- confirm this is intended.
df[['Height', 'Age', 'Weight']].aggregate(['max', 'min'])
df["Sport"].value_counts()

# Row counts per (Sport, Team); keep the first ten after sorting descending.
collist = ['Sport', 'Team', 'ID']
df_chart1 = df[collist].groupby(["Sport", "Team"]).count()
df_chart1
df_chart1 = df_chart1.sort_values(by=["Sport", "ID"], ascending=False)[:10].reset_index()
df_chart1

# Converting categorical column values to numerical values for analysis.
# The result is assigned back: an in-place replace on a selected column is
# a chained assignment and may not update df.
df['status'] = df['status'].replace(
    ['Ready to move', 'Under Construction', 'Unfurnished'], [0, 1, 2]
)
df['nature'] = df['nature'].replace(['New', 'Resale'], [0, 1])

df[['builder_exp']].sample(10)
# Strip the " year" / " years" unit from the age column.
# NOTE(review): the values stay strings after this step; add an explicit
# numeric conversion if numbers are required.
df['age'] = df['age'].str.replace(r' years?', '', regex=True)

# Dropping column description and rate as they have no relevance to analysis of data
df = df.drop(columns=['rate', 'description'])

# Imputing
# Converting type of 'score' column to float for imputing
df['score'] = df['score'].astype(float)
# Missing scores are replaced by the column median.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
df['score'] = imp_median.fit_transform(df[['score']])

# Bar chart of the ID counts in df_chart1, one bar per team.
chart_size = (10, 6)
plt.figure(figsize=chart_size)
sns.barplot(data=df_chart1, x="Team", y="ID")

相关问题