import pandas as pd


# Three sample passengers, written one tuple per row as (Name, Age, Sex).
df = pd.DataFrame(
    data=[
        ("Braund, Mr. Owen Harris", 22, "male"),
        ("Allen, Mr. William Henry", 35, "male"),
        ("Bonnell, Miss. Elizabeth", 58, "female"),
    ],
    columns=["Name", "Age", "Sex"],
)


print(df)

                       Name  Age     Sex
0   Braund, Mr. Owen Harris   22    male
1  Allen, Mr. William Henry   35    male
2  Bonnell, Miss. Elizabeth   58  female

df


xx = df["Age"]

xx

0    22
1    35
2    58
Name: Age, dtype: int64


df["Age"].max()

58


xx.min()

22


xx.mean()

38.333333333333336


xx.std()

18.230011885167087

df


## Describing numerical data

df.describe()


# Column-oriented data for the BRICS countries: each key becomes a column
# label and each list holds that column's values, one entry per country.
dict_1 = dict(
    Country=["Brazil", "Russia", "India", "China", "South Africa"],
    Capital=["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
    Area=[8.516, 17.10, 3.286, 9.597, 1.221],
    Population=[200.4, 143.5, 1252, 1357, 52.98],
)

print(dict_1)

{'Country': ['Brazil', 'Russia', 'India', 'China', 'South Africa'], 'Capital': ['Brasilia', 'Moscow', 'New Dehli', 'Beijing', 'Pretoria'], 'Area': [8.516, 17.1, 3.286, 9.597, 1.221], 'Population': [200.4, 143.5, 1252, 1357, 52.98]}


brics = pd.DataFrame(dict_1)


brics


brics.head(3)


print(brics)

        Country    Capital    Area  Population
0        Brazil   Brasilia   8.516      200.40
1        Russia     Moscow  17.100      143.50
2         India  New Dehli   3.286     1252.00
3         China    Beijing   9.597     1357.00
4  South Africa   Pretoria   1.221       52.98


# Set the index for brics
brics.index = ["BR", "RU", "IN", "CH", "SA"]

# Print out brics with new index values
print(brics)

         Country    Capital    Area  Population
BR        Brazil   Brasilia   8.516      200.40
RU        Russia     Moscow  17.100      143.50
IN         India  New Dehli   3.286     1252.00
CH         China    Beijing   9.597     1357.00
SA  South Africa   Pretoria   1.221       52.98


## Describing numerical data 

brics.describe()


brics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, BR to SA
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     5 non-null      object 
 1   Capital     5 non-null      object 
 2   Area        5 non-null      float64
 3   Population  5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 200.0+ bytes


import numpy as np

# Student marks, column by column. np.nan marks the one missing
# "Statistical Inference" score (Juliet's).
dict_2 = {
    "Name": ["Alice", "Juliet", "Alex", "Sara", "Oliver"],
    "Statistical Inference": [87, np.nan, 96, 65, 87],
    "Statistical Methods": [94, 87, 58, 63, 72],
    "Vectors & Matrices": [92, 74, 57, 96, 88],
    "Remarks": ["Excellent", "Good", "Good", "Good", "Very Good"],
}
print(dict_2)

{'Name': ['Alice', 'Juliet', 'Alex', 'Sara', 'Oliver'], 'Statistical Inference': [87, nan, 96, 65, 87], 'Statistical Methods': [94, 87, 58, 63, 72], 'Vectors & Matrices': [92, 74, 57, 96, 88], 'Remarks': ['Excellent', 'Good', 'Good', 'Good', 'Very Good']}


brics1 = pd.DataFrame(dict_2)
print(brics1)

     Name  Statistical Inference  Statistical Methods  Vectors & Matrices  \
0   Alice                   87.0                   94                  92   
1  Juliet                    NaN                   87                  74   
2    Alex                   96.0                   58                  57   
3    Sara                   65.0                   63                  96   
4  Oliver                   87.0                   72                  88   

     Remarks  
0  Excellent  
1       Good  
2       Good  
3       Good  
4  Very Good


brics1


brics1.describe()


brics1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   5 non-null      object 
 1   Statistical Inference  4 non-null      float64
 2   Statistical Methods    5 non-null      int64  
 3   Vectors & Matrices     5 non-null      int64  
 4   Remarks                5 non-null      object 
dtypes: float64(1), int64(2), object(2)
memory usage: 328.0+ bytes


brics1["Vectors & Matrices"]

0    92
1    74
2    57
3    96
4    88
Name: Vectors & Matrices, dtype: int64


brics1.index = ["P1", "P2", "P3", "P4", "P5"]
print(brics1)

      Name  Statistical Inference  Statistical Methods  Vectors & Matrices  \
P1   Alice                   87.0                   94                  92   
P2  Juliet                    NaN                   87                  74   
P3    Alex                   96.0                   58                  57   
P4    Sara                   65.0                   63                  96   
P5  Oliver                   87.0                   72                  88   

      Remarks  
P1  Excellent  
P2       Good  
P3       Good  
P4       Good  
P5  Very Good


brics1


brics1.shape

(5, 5)


## Write to EXCEL

brics1.to_excel("brics_test.xlsx", sheet_name="brics_1", index=True)


## Write to CSV

# header=True writes the column names as the first line; index=False leaves
# the row labels (P1..P5) out of the file, so reading it back yields a
# default 0..4 index.
brics1.to_csv("brics_test.csv", header=True, index=False)


## Read from EXCEL

titanic = pd.read_excel("brics_test.xlsx", sheet_name="brics_1", index_col=0)
print(titanic)

      Name  Statistical Inference  Statistical Methods  Vectors & Matrices  \
P1   Alice                   87.0                   94                  92   
P2  Juliet                    NaN                   87                  74   
P3    Alex                   96.0                   58                  57   
P4    Sara                   65.0                   63                  96   
P5  Oliver                   87.0                   72                  88   

      Remarks  
P1  Excellent  
P2       Good  
P3       Good  
P4       Good  
P5  Very Good


titanic


titanic.head(3)


## Read from CSV

titanic_csv = pd.read_csv("brics_test.csv")  # pass index_col=0 only when the CSV file contains an index column
print(titanic_csv)

     Name  Statistical Inference  Statistical Methods  Vectors & Matrices  \
0   Alice                   87.0                   94                  92   
1  Juliet                    NaN                   87                  74   
2    Alex                   96.0                   58                  57   
3    Sara                   65.0                   63                  96   
4  Oliver                   87.0                   72                  88   

     Remarks  
0  Excellent  
1       Good  
2       Good  
3       Good  
4  Very Good


titanic_csv


titanic_csv.head(2)


capitals = titanic_csv["Remarks"]


capitals.head()

0    Excellent
1         Good
2         Good
3         Good
4    Very Good
Name: Remarks, dtype: object


type(titanic_csv["Remarks"])

pandas.core.series.Series


type(capitals)

pandas.core.series.Series


capitals.shape

(5,)


cap_area = titanic_csv[["Name", "Remarks"]]
cap_area


cap_area.shape

(5, 2)


## Remove index_col=0 and execute the earlier cells to see the change


### Data fetching based on conditions

titanic_subset = titanic_csv[titanic_csv["Statistical Methods"] > 70.0]


titanic_subset


titanic_csv


class_23 = titanic_csv[titanic_csv["Remarks"].isin(["Excellent", "Very Good"])]
class_23


## AND (&) OR (|) 

cond = titanic_csv[(titanic_csv["Remarks"] == "Good") | (titanic_csv["Vectors & Matrices"] >= 90.00)]
cond


titanic_csv[(titanic_csv["Remarks"] == "Good") | (titanic_csv["Vectors & Matrices"] >= 90.00)]


cond = titanic_csv[(titanic_csv["Remarks"] == "Good") & (titanic_csv["Vectors & Matrices"] >= 70.00)]
cond


## Display only NOT NULL values

area_no_na = titanic_csv[titanic_csv["Statistical Inference"].notna()]
area_no_na


titanic_csv


test = titanic_csv.iloc[2:3, 1:3]


test


ti = titanic_csv.copy()
ti


ti.iloc[2:3, 1:3] = np.nan

ti


titanic_csv


s = pd.Series([20, 40, 50, np.nan, 70], index=[0,1,2,3,4])

s

0    20.0
1    40.0
2    50.0
3     NaN
4    70.0
dtype: float64


titanic_csv['Python'] = s.values


titanic_csv


titanic_csv['1st'] = titanic_csv["Statistical Inference"] > 50


xx = titanic_csv[["Statistical Inference", "Statistical Methods", "Vectors & Matrices"]]
xx


titanic_csv['2nd'] = xx.mean(axis=1)


titanic_csv['3rd'] = xx.max(axis=1)


titanic_csv


titanic_csv


titanic_csv["new"] = titanic_csv["Statistical Methods"] * 1.882
titanic_csv


titanic_csv["new_1"] = titanic_csv["Statistical Methods"] / titanic_csv["Vectors & Matrices"]
titanic_csv


titanic_csv = titanic_csv.rename(
    columns={
        "new": "Multiply",
        "new_1": "Ratio",
    }
)


titanic_csv


titanic_csv = titanic_csv.rename(columns=str.lower)
titanic_csv.head()


titanic_csv = titanic_csv.rename(columns=str.capitalize)
titanic_csv.head()


titanic_csv = titanic_csv.rename(columns=str.upper)
titanic_csv.head()


titanic_csv.sort_values(by="STATISTICAL METHODS", ascending=True).head()


titanic_csv_1 = titanic_csv.sort_values(by="STATISTICAL METHODS", ascending=True).copy()
titanic_csv_1.head()


titanic_csv


good = titanic_csv[titanic_csv["REMARKS"] == "Good"].copy()
good


s = pd.Series(["tezpur", "kolkata", "tezpur"], index=[1,2,3])
good["Location"] = s.values


good


titanic_csv


titanic_csv.index = ["BR", "RU", "IN", "CH", "SA"]
titanic_csv.head()


brics


merged = pd.concat([titanic_csv, brics], axis=1)
merged.head()


titanic_csv.index = [0, 1, 2, 3, 4]
titanic_csv.head()


x1 = titanic_csv[0:3].copy()
x1


x2 = titanic_csv[3:5].copy()
x2


x1_x2 = pd.concat([x1, x2], axis=0)
x1_x2.head()


titi = titanic_csv.sort_values("STATISTICAL METHODS")
titi


titanic_csv


x1.head()


x2.head()


x1x2 = pd.concat([x1, x2], keys=["X1", "X2"])


x1x2


x1x2["NAME"].str.lower()

X1  0     alice
    1    juliet
    2      alex
X2  3      sara
    4    oliver
Name: NAME, dtype: object


x1x2["REMARKS"].str.split(" ")

X1  0     [Excellent]
    1          [Good]
    2          [Good]
X2  3          [Good]
    4    [Very, Good]
Name: REMARKS, dtype: object


x1x2["REMARKS"].str.split(" ").str.get(1)

X1  0     NaN
    1     NaN
    2     NaN
X2  3     NaN
    4    Good
Name: REMARKS, dtype: object


x1x2["REMARKS"].str.contains("Good")

X1  0    False
    1     True
    2     True
X2  3     True
    4     True
Name: REMARKS, dtype: bool


x1x2[x1x2["REMARKS"].str.contains("Good")]


x1x2


#Longest name 
x1x2["NAME"].str.len()

X1  0    5
    1    6
    2    4
X2  3    4
    4    6
Name: NAME, dtype: int64


x1x2["NAME"].str.len().idxmax()

('X1', 1)


# iloc: selects rows and columns by integer position
# loc: selects rows and columns by label


x1x2.loc[x1x2["NAME"].str.len().idxmax(), "NAME"]

'Juliet'


x1x2["Rem:Short"] = x1x2["REMARKS"].replace({"Good": "G", "Very Good": "V", "Excellent": "E"})


x1x2


x1x2.columns

Index(['NAME', 'STATISTICAL INFERENCE', 'STATISTICAL METHODS',
       'VECTORS & MATRICES', 'REMARKS', 'PYTHON', '1ST', '2ND', '3RD',
       'MULTIPLY', 'RATIO', 'Rem:Short'],
      dtype='object')


x1x2=x1x2.rename(columns={'Rem:Short':'RM'})
x1x2.columns

Index(['NAME', 'STATISTICAL INFERENCE', 'STATISTICAL METHODS',
       'VECTORS & MATRICES', 'REMARKS', 'PYTHON', '1ST', '2ND', '3RD',
       'MULTIPLY', 'RATIO', 'RM'],
      dtype='object')


x1x2


x1x2 = x1x2.rename(columns={'RM':'Rem:Short'})
x1x2.columns

Index(['NAME', 'STATISTICAL INFERENCE', 'STATISTICAL METHODS',
       'VECTORS & MATRICES', 'REMARKS', 'PYTHON', '1ST', '2ND', '3RD',
       'MULTIPLY', 'RATIO', 'Rem:Short'],
      dtype='object')


x1x2


x1x2.columns = x1x2.columns.str.replace(' ', '_')
x1x2.columns

Index(['NAME', 'STATISTICAL_INFERENCE', 'STATISTICAL_METHODS',
       'VECTORS_&_MATRICES', 'REMARKS', 'PYTHON', '1ST', '2ND', '3RD',
       'MULTIPLY', 'RATIO', 'Rem:Short'],
      dtype='object')


x1x2


### Dropping the columns


# Column drop: with inplace=False, drop() returns a new frame without
# 'Rem:Short'; x1x2 itself is not modified.
x1x2.drop('Rem:Short', axis=1, inplace=False).head()
# Compare with x1x2.head(), which still shows the 'Rem:Short' column.


x1x2


# The frame returned by drop() is discarded here, so the head() below
# still lists 'Rem:Short' among the columns.
x1x2.drop('Rem:Short', axis=1, inplace=False)
x1x2.head()


temp_x = x1x2.copy()
temp_x.drop(['Rem:Short', 'STATISTICAL_METHODS'], axis=1, inplace=True)


temp_x


# Row drop
temp_x.drop(temp_x.index[[0, 1]], axis=0, inplace=True)


temp_x


x1x2


# The loc method is used to select rows and columns by label.
# The iloc method is used to select rows and columns by integer position

x1x2.loc[('X1',0):('X2',3), :] # last inclusive


x1x2.tail(4)


x1x2_T = x1x2.T
x1x2_T


x1x2


x1x2.sample(n=2)


x1x2


x1x2.loc[5] = ["Oliver", 87.0, 72, 88, "Very Good", 70.0, True, 82.333333, 88.0, 135.504, 0.818182, "V"]


x1x2


x1x2.loc[len(x1x2.index)]=["Oliver", 87.0, 72, 88, "Very Good", 135.504, True, 0.818182, 95, 110.00, 1.01, "V"]


x1x2


# A new row expressed as a list of dicts: one dict per row, keyed by column label.
data = [{"NAME": "Oliver", "STATISTICAL_INFERENCE": 87.0, "STATISTICAL_METHODS": 72, "VECTORS_&_MATRICES": 88, "REMARKS": "Very Good", "PYTHON": 135.504, "1ST": True, "2ND": 0.818182, "3RD": 95, "MULTIPLY": 110.00, "RATIO": 1.01, "Rem:Short": "V"}]


# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows. ignore_index=True discards the
# existing row labels and renumbers the result 0..n-1.
x1x2 = pd.concat([x1x2, pd.DataFrame(data)], ignore_index=True, sort=False)
x1x2


# Same row again, except for a different PYTHON value.
data = [{"NAME": "Oliver", "STATISTICAL_INFERENCE": 87.0, "STATISTICAL_METHODS": 72, "VECTORS_&_MATRICES": 88, "REMARKS": "Very Good", "PYTHON": 13509, "1ST": True, "2ND": 0.818182, "3RD": 95, "MULTIPLY": 110.00, "RATIO": 1.01, "Rem:Short": "V"}]


# ignore_index=False keeps the labels of both inputs, so the new row keeps
# its own label 0 and the result contains label 0 twice.
x1x2 = pd.concat([x1x2, pd.DataFrame(data)], ignore_index=False, sort=False)


x1x2


# Two small frames sharing the same columns ('A', 'B') and row labels ('x', 'y').
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y'])
df


df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y'])
df2  # display the frame just created (the original cell re-displayed df)


# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
# instead. By default the original labels are kept: x, y, x, y.
pd.concat([df, df2])


# ignore_index=True renumbers the stacked rows 0..3.
pd.concat([df, df2], ignore_index=True)


# Inserting a column at any position
x1x2.insert(1, "D", 5) # In-place operation


x1x2


x1x2.insert(1, "D_new", [5, 5,55,5,60,30,20, 34, 35])
x1x2


x1x2["C"] = [10, 20, 30, 40, 100, 20, 1, 56, 45]
x1x2


import numpy as np
x1x2.loc[:, "E"] = list(np.random.rand(9))
x1x2


x1x2_copy = x1x2.assign(G = x1x2.E * 100) # keyword form: the new column label must be a valid Python identifier (it need not be a single letter)
x1x2_copy


x1x2.drop(['D_new', 'C', 'E'], axis=1, inplace=True)
x1x2


x1x2_n = x1x2.copy()


x1x2.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
0    False
dtype: bool


# Keep only the rows not flagged by duplicated(); ~ negates the boolean mask.
x1x2 = x1x2[~x1x2.duplicated()]


x1x2


# Remove duplicates
x1x2_n.drop_duplicates() # Which other techniques achieve the same result using x1x2.duplicated()?

	Age
count	3.000000
mean	38.333333
std	18.230012
min	22.000000
25%	28.500000
50%	35.000000
75%	46.500000
max	58.000000

	Area	Population
count	5.000000	5.000000
mean	7.944000	601.176000
std	6.200557	645.261454
min	1.221000	52.980000
25%	3.286000	143.500000
50%	8.516000	200.400000
75%	9.597000	1252.000000
max	17.100000	1357.000000

	Statistical Inference	Statistical Methods	Vectors & Matrices
count	4.000000	5.000000	5.000000
mean	83.750000	74.800000	81.400000
std	13.200379	15.385058	15.962456
min	65.000000	58.000000	57.000000
25%	81.500000	63.000000	74.000000
50%	87.000000	72.000000	88.000000
75%	89.250000	87.000000	92.000000
max	96.000000	94.000000	96.000000

Tutorial on Python Library: pandas

1. Import library

2. Intro

3. From Dictionary to Dataframe

4. Write and Read to/from csv or excel files

5. Subset selection and applying formulas to new columns

6. Index Location

7. Renaming column headers

8. Sorting the table

9. Text manipulation

10. Renaming the column labels

11. Random sampling

12. Inserting rows and columns

13. Duplicates removal

References

	Name	Age	Sex
0	Braund, Mr. Owen Harris	22	male
1	Allen, Mr. William Henry	35	male
2	Bonnell, Miss. Elizabeth	58	female

	Country	Capital	Area	Population
0	Brazil	Brasilia	8.516	200.40
1	Russia	Moscow	17.100	143.50
2	India	New Dehli	3.286	1252.00
3	China	Beijing	9.597	1357.00
4	South Africa	Pretoria	1.221	52.98

	Name	Statistical Inference	Statistical Methods	Vectors & Matrices	Remarks
0	Alice	87.0	94	92	Excellent
1	Juliet	NaN	87	74	Good
2	Alex	96.0	58	57	Good
3	Sara	65.0	63	96	Good
4	Oliver	87.0	72	88	Very Good

	Name	Statistical Inference	Statistical Methods	Vectors & Matrices	Remarks
0	Alice	87.0	94.0	92	Excellent
1	Juliet	NaN	87.0	74	Good
2	Alex	NaN	NaN	57	Good
3	Sara	65.0	63.0	96	Good
4	Oliver	87.0	72.0	88	Very Good

	NAME	STATISTICAL INFERENCE	STATISTICAL METHODS	VECTORS & MATRICES	REMARKS	PYTHON	1ST	2ND	3RD	MULTIPLY	RATIO
1	Juliet	NaN	87	74	Good	40.0	False	80.500000	87.0	163.734	1.175676
2	Alex	96.0	58	57	Good	50.0	True	70.333333	96.0	109.156	1.017544
3	Sara	65.0	63	96	Good	NaN	True	74.666667	96.0	118.566	0.656250

	X1			X2
	0	1	2	3	4
NAME	Alice	Juliet	Alex	Sara	Oliver
STATISTICAL_INFERENCE	87.0	NaN	96.0	65.0	87.0
STATISTICAL_METHODS	94	87	58	63	72
VECTORS_&_MATRICES	92	74	57	96	88
REMARKS	Excellent	Good	Good	Good	Very Good
PYTHON	20.0	40.0	50.0	NaN	70.0
1ST	True	False	True	True	True
2ND	91.0	80.5	70.333333	74.666667	82.333333
3RD	94.0	87.0	96.0	96.0	88.0
MULTIPLY	176.908	163.734	109.156	118.566	135.504
RATIO	1.021739	1.175676	1.017544	0.65625	0.818182
Rem:Short	E	G	G	G	V

	NAME	STATISTICAL_INFERENCE	STATISTICAL_METHODS	VECTORS_&_MATRICES	REMARKS	PYTHON	1ST	2ND	3RD	MULTIPLY	RATIO	Rem:Short
(X1, 0)	Alice	87.0	94	92	Excellent	20.0	True	91.000000	94.0	176.908	1.021739	E
(X1, 1)	Juliet	NaN	87	74	Good	40.0	False	80.500000	87.0	163.734	1.175676	G
(X1, 2)	Alex	96.0	58	57	Good	50.0	True	70.333333	96.0	109.156	1.017544	G
(X2, 3)	Sara	65.0	63	96	Good	NaN	True	74.666667	96.0	118.566	0.656250	G
(X2, 4)	Oliver	87.0	72	88	Very Good	70.0	True	82.333333	88.0	135.504	0.818182	V
5	Oliver	87.0	72	88	Very Good	70.0	True	82.333333	88.0	135.504	0.818182	V