In [8]:
# =========================
# Importing a CSV (PANDAS)
# =========================

# To work with CSV files in Python, we use the pandas library
import pandas as pd
# -------------------------
# Basic CSV import
# -------------------------

# The most common way to import a CSV is using:
df = pd.read_csv("data.csv")

# This reads the file and stores it in a DataFrame (df)
# A DataFrame is like a table (rows and columns)
In [9]:
# -------------------------
# Viewing the data
# -------------------------

# Show the first 5 rows
df.head()
Out[9]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
In [76]:
# Show basic information about the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              10 non-null     object
 1   age               10 non-null     int64 
 2   city              10 non-null     object
 3   department        10 non-null     object
 4   salary            10 non-null     int64 
 5   years_experience  10 non-null     int64 
 6   is_full_time      10 non-null     bool  
 7   start_date        10 non-null     object
dtypes: bool(1), int64(3), object(4)
memory usage: 698.0+ bytes
In [77]:
# Show summary statistics (only for numeric columns)
df.describe()
Out[77]:
age salary years_experience
count 10.0000 10.00000 10.000000
mean 29.1000 3860.00000 5.300000
std 5.3427 911.89668 4.270051
min 22.0000 2800.00000 1.000000
25% 25.5000 3275.00000 2.250000
50% 28.5000 3650.00000 4.500000
75% 30.7500 4150.00000 5.750000
max 40.0000 6000.00000 15.000000
In [78]:
# -------------------------
# Different separators
# -------------------------

# By default, pandas assumes the file is separated by commas (,)
# But sometimes other separators are used

# Example CSV with semicolon (common in Europe):
# name;age;city;salary
# Alice;25;Rotterdam;3200

# In that case, you must specify the separator, e.g. sep=";"
# Our data.csv is comma-separated, so here we pass the (default) sep="," explicitly:
df = pd.read_csv("data.csv", sep=",")
In [79]:
# -------------------------
# Common issues
# -------------------------

# Problem: Wrong separator
# If your columns look wrong (everything in one column),
# you probably need to change 'sep'


# Problem: Encoding errors
# Sometimes CSV files use different text encodings
df = pd.read_csv("data.csv", encoding="utf-8")
# or:
df = pd.read_csv("data.csv", encoding="latin-1")
In [80]:
# -------------------------
# Printing a column (Series)
# -------------------------

# You can access a column by using its name:
city = df["city"]
print(city)
print("====================================================================================")

# Output will look like:
# 0    Rotterdam
# 1    Amsterdam
# 2    Utrecht
# ...

# The numbers on the left (0, 1, 2, ...) are the index.

# What is the index?
# - It is a number automatically added by pandas
# - It shows the position of each row in the table
# - It is NOT part of the actual data (like city or name)

# Think of it like a row number in a table:
# Row 0 = first row
# Row 1 = second row
# etc.

# You can use the index to get a specific value:
print(city[0])  # Rotterdam
0    Rotterdam
1    Amsterdam
2      Utrecht
3    The Hague
4    Eindhoven
5    Rotterdam
6    Amsterdam
7      Utrecht
8    Eindhoven
9    The Hague
Name: city, dtype: object
====================================================================================
Rotterdam
In [81]:
# We can verify the type by printing the type
print(type(city))
<class 'pandas.core.series.Series'>
In [82]:
# -------------------------
# Setting a custom index
# -------------------------

# By default, pandas uses a numeric index (0, 1, 2, ...).
# But you can choose one of the columns to be the index instead.

# Example: using the "name" column as the index
df_indexed = df.set_index("name")

print(df_indexed)

# Now the index will be:
# Alice, Bob, Charlie, etc.
# instead of numbers like 0, 1, 2
         age       city   department  salary  years_experience  is_full_time  \
name                                                                           
Alice     25  Rotterdam    Marketing    3200                 2          True   
Bob       30  Amsterdam  Engineering    4000                 5          True   
Charlie   28    Utrecht        Sales    3500                 3         False   
Diana     22  The Hague    Marketing    2800                 1          True   
Ethan     35  Eindhoven  Engineering    4500                10          True   
Fiona     27  Rotterdam        Sales    3600                 4         False   
George    40  Amsterdam   Management    6000                15          True   
Hannah    24    Utrecht  Engineering    3100                 2          True   
Ivan      31  Eindhoven        Sales    4200                 6         False   
Julia     29  The Hague    Marketing    3700                 5          True   

         start_date  
name                 
Alice    2022-06-15  
Bob      2020-03-10  
Charlie  2021-09-01  
Diana    2023-01-20  
Ethan    2015-11-05  
Fiona    2021-04-18  
George   2010-07-30  
Hannah   2022-10-12  
Ivan     2019-02-25  
Julia    2018-08-09  
In [83]:
# -------------------------
# Why use a custom index?
# -------------------------

# Using a meaningful column as the index can make data easier to work with.

# For example:
# - Each person has a unique name in this dataset
# - So we can quickly look up a person by name

# Example:
df_indexed.loc["Alice"]

# This returns all information about Alice in one row.


# -------------------------
# When is this useful?
# -------------------------

# A custom index is useful when:
# - The column has unique values (no duplicates)
# - You want to quickly look up rows by a meaningful label
# - The index represents something "real" (like name, ID, date, etc.)

# In this dataset, "name" works well because:
# - Each row represents one person
# - Names are easy to understand and use for lookup
Out[83]:
age                         25
city                 Rotterdam
department           Marketing
salary                    3200
years_experience             2
is_full_time              True
start_date          2022-06-15
Name: Alice, dtype: object
In [84]:
# -------------------------
# We can further drill into Alice's information.
# -------------------------

# Let's say we want to find her age. It would be as simple as accessing her information as we just did and then specifying which column we want to display:
df_indexed.loc["Alice", "age"]

# This works because:
# - "Alice" is the index (row label)
# - "age" is the column name

# We can also access multiple columns for the same person:
df_indexed.loc["Alice", ["age", "city", "salary"]]

# This will return a small table with only the selected columns.
Out[84]:
age              25
city      Rotterdam
salary         3200
Name: Alice, dtype: object
In [85]:
# -------------------------
# Loc, column location
# -------------------------

# .loc is used to select data from a DataFrame using labels (names).


# The general format is:
# df.loc[row_label, column_label]

# In this example:
df_indexed.loc["Alice", "age"]

# "Alice"  -> row label (index value)
# "age"    -> column name
Out[85]:
25
In [86]:
# So .loc means:
# Go to the row with label 'Alice' and return the value in the 'age' column

# -------------------------
# Simple way to think about it
# -------------------------

# .loc[row, column]

# Row = which person (or entry) you want
# Column = which piece of information you want

# -------------------------
# Example
# -------------------------

# Get Alice's city:
df_indexed.loc["Alice", "city"]
Out[86]:
'Rotterdam'
In [87]:
# Get multiple columns for Alice:
df_indexed.loc["Alice", ["age", "city", "salary"]]
Out[87]:
age              25
city      Rotterdam
salary         3200
Name: Alice, dtype: object
In [88]:
# IMPORTANT: .loc uses labels (column names), not positions (numbers)

# This will NOT work because 0:2 are positions, not column names:
df_indexed.loc["Alice", 0:2]

# If you want to select columns by position, you should use .iloc instead
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_29212\2695698881.py in <module>
      2 
      3 # This will NOT work because 0:2 are positions, not column names:
----> 4 df_indexed.loc["Alice", 0:2]
      5 
      6 # If you want to select columns by position, you should use .iloc instead

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
    959             if self._is_scalar_access(key):
    960                 return self.obj._get_value(*key, takeable=self._takeable)
--> 961             return self._getitem_tuple(key)
    962         else:
    963             # we by definition only have the 0th axis

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
   1138         with suppress(IndexingError):
   1139             tup = self._expand_ellipsis(tup)
-> 1140             return self._getitem_lowerdim(tup)
   1141 
   1142         # no multi-index, so validate all of the indexers

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
    889                     return section
    890                 # This is an elided recursive call to iloc/loc
--> 891                 return getattr(section, self.name)[new_key]
    892 
    893         raise IndexingError("not applicable")

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
    965 
    966             maybe_callable = com.apply_if_callable(key, self.obj)
--> 967             return self._getitem_axis(maybe_callable, axis=axis)
    968 
    969     def _is_scalar_access(self, key: tuple):

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1181         if isinstance(key, slice):
   1182             self._validate_key(key, axis)
-> 1183             return self._get_slice_axis(key, axis=axis)
   1184         elif com.is_bool_indexer(key):
   1185             return self._getbool_axis(key, axis=axis)

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_slice_axis(self, slice_obj, axis)
   1215 
   1216         labels = obj._get_axis(axis)
-> 1217         indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
   1218 
   1219         if isinstance(indexer, slice):

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in slice_indexer(self, start, end, step, kind)
   6286         self._deprecated_arg(kind, "kind", "slice_indexer")
   6287 
-> 6288         start_slice, end_slice = self.slice_locs(start, end, step=step)
   6289 
   6290         # return a slice

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in slice_locs(self, start, end, step, kind)
   6496         start_slice = None
   6497         if start is not None:
-> 6498             start_slice = self.get_slice_bound(start, "left")
   6499         if start_slice is None:
   6500             start_slice = 0

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_slice_bound(self, label, side, kind)
   6405         # For datetime indices label may be a string that has to be converted
   6406         # to datetime boundary according to its resolution.
-> 6407         label = self._maybe_cast_slice_bound(label, side)
   6408 
   6409         # we need to look up the label

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _maybe_cast_slice_bound(self, label, side, kind)
   6352         # reject them, if index does not contain label
   6353         if (is_float(label) or is_integer(label)) and label not in self:
-> 6354             raise self._invalid_indexer("slice", label)
   6355 
   6356         return label

TypeError: cannot do slice indexing on Index with these indexers [0] of type int
In [89]:
# -------------------------
# iloc (index location)
# -------------------------

# .iloc is used to select data based on POSITION (numbers), not names.

# The general format is:
# df.iloc[row_position, column_position]

# -------------------------
# Example
# -------------------------

# Get Alice's row using position:
# This returns the first row in the dataset (Alice) 
# Keep in mind that df_indexed is indexed with Name as the index and Alice is the first name in the list
df_indexed.iloc[0]
Out[89]:
age                         25
city                 Rotterdam
department           Marketing
salary                    3200
years_experience             2
is_full_time              True
start_date          2022-06-15
Name: Alice, dtype: object
In [90]:
# Get a specific value using row and column positions:
df_indexed.iloc[0, 1]

# This means:
# - Row at position 0 (first row → Alice)
# - Column at position 1 (second column) this would be city in our dataset
Out[90]:
'Rotterdam'
In [91]:
# -------------------------
# Selecting a range of columns
# -------------------------
# Get the first row, and columns 0 to 2:
df_indexed.iloc[0, 0:3]

# Note:
# - 0:3 means columns at positions 0, 1, and 2
# - The end number (3) is NOT included

# -------------------------
# Simple rule
# -------------------------

# .loc  → uses names (labels)
# .iloc → uses positions (numbers)

# Use .iloc when you want to work with column/row positions
# instead of column names
In [92]:
# How to select multiple columns with .iloc
# You need to pass a list of column positions using []:
df_indexed.iloc[0, [0,3,5]]
Out[92]:
age               25
salary          3200
is_full_time    True
Name: Alice, dtype: object
In [93]:
# Similarly you can also select multiple rows 
df_indexed.iloc[0:2, [0,3,5]]
Out[93]:
age salary is_full_time
name
Alice 25 3200 True
Bob 30 4000 True
In [94]:
# or
df_indexed.iloc[:4, [0,3,5]]

# What does ":" mean in this context?

# ":" means "a range" or "from start to end".

# Example:
# 0:2 means:
# - start at index 0
# - go up to (but not including) index 2
# - rows at positions 0 and 1

# :4 means:
# - start from the beginning
# - go up to (but not including) index 4
# - rows at positions 0, 1, 2, 3

# So:
# df_indexed.iloc[:4, [0,3,5]]
# means:
# - take the first 4 rows
# - and only columns at positions 0, 3, and 5
Out[94]:
age salary is_full_time
name
Alice 25 3200 True
Bob 30 4000 True
Charlie 28 3500 False
Diana 22 2800 True
In [95]:
# Using negative indices (sometimes useful)
df_indexed.iloc[-1]

# This gets the last row in the dataset
Out[95]:
age                         29
city                 The Hague
department           Marketing
salary                    3700
years_experience             5
is_full_time              True
start_date          2018-08-09
Name: Julia, dtype: object
In [96]:
# In short: .iloc takes in 2 arguments — the row position and the column position:
# df.iloc[row_position, column_position]
In [97]:
# -------------------------
# Filtering data
# -------------------------

# Filtering means selecting rows based on a condition.

# The basic idea is:
# df[condition]

# The condition returns True or False for each row.
# Only rows with True are kept.

# Example 1: Filter people older than 30

df[df["age"] > 30]
Out[97]:
name age city department salary years_experience is_full_time start_date
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [98]:
# Example 1: Filter people older than 30

df[df["age"] > 30]
Out[98]:
name age city department salary years_experience is_full_time start_date
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [99]:
# Example 2: Filter by department

df[df["department"] == "Engineering"]
Out[99]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
In [100]:
# Example 3: Filter by multiple conditions
# Keep only the people who are older than 25 AND have a salary higher than 3500
df[(df["age"] > 25) & (df["salary"] > 3500)]
Out[100]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
In [101]:
# Example 4: Full-time employees only

df[df["is_full_time"] == True]
Out[101]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
In [102]:
# Example 5: Filter by city

df[df["city"] == "Amsterdam"]
Out[102]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [ ]:
# Notes:
# - Use () around each condition when combining them
# - Use:
#   &  for AND
#   |  for OR
# - Do NOT use "and" / "or" with pandas conditions
# - Always write conditions inside df[...]
In [103]:
# -------------------------
# Sorting data
# -------------------------

# You can sort a DataFrame using the sort_values() function.

# The basic format:
# df.sort_values(by="column_name")
# Example 1: Sort by age (ascending)

df.sort_values(by="age")
Out[103]:
name age city department salary years_experience is_full_time start_date
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [104]:
# Example 2: Sort by salary (highest first)

df.sort_values(by="salary", ascending=False)
Out[104]:
name age city department salary years_experience is_full_time start_date
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
In [105]:
# Example 3: Sort by multiple columns

df.sort_values(by=["department", "salary"])
Out[105]:
name age city department salary years_experience is_full_time start_date
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [106]:
# -------------------------
# Combining filtering and sorting
# -------------------------

# You can first filter data, and then sort the result.

# Example: Show Engineering employees sorted by salary (highest first)

df[df["department"] == "Engineering"].sort_values(by="salary", ascending=False)
Out[106]:
name age city department salary years_experience is_full_time start_date
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
In [108]:
# Example: Employees older than 25, sorted by age
# Note that we call a function on top of our initial filtering
df[df["age"] > 25].sort_values(by="age")
Out[108]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
In [110]:
# The code can be split into 2 steps.
# 1. First the filtering:
filtered_result = df[df["age"] > 25 ]
Out[110]:
name age city department salary years_experience is_full_time start_date
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [111]:
# 2. Then we sort
filtered_result.sort_values(by="age")
Out[111]:
name age city department salary years_experience is_full_time start_date
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [114]:
# sort_values is smart enough to also work with strings.
# Let's say we want to sort the rows alphabetically by city.
# In that case we can simply pass "city" to the function:
filtered_result.sort_values(by="city")
Out[114]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
In [115]:
# -------------------------
# Using isin() for multiple values
# -------------------------

# isin() is useful when you want to filter multiple values from one column.

# Example: Filter specific cities
# Note we're passing a list to the isin function -> ["Amsterdam", "Rotterdam"]

df[df["city"].isin(["Amsterdam", "Rotterdam"])]
Out[115]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [117]:
# Example: Filter multiple departments

df[df["department"].isin(["Engineering", "Sales"])]
Out[117]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [118]:
# -------------------------
# Grouping data (groupby)
# -------------------------

# groupby() is used to group data based on a column,
# and then perform calculations on each group.

# Basic idea:
# - Split data into groups
# - Apply a calculation
# - Combine the result
In [119]:
# Example 1: Average salary per department

df.groupby("department")["salary"].mean()
Out[119]:
department
Engineering    3866.666667
Management     6000.000000
Marketing      3233.333333
Sales          3766.666667
Name: salary, dtype: float64
In [ ]:
# Let's dissect what happens here with our dataset
# The csv contains the following data:

# name,age,city,department,salary,years_experience,is_full_time,start_date
# Alice,25,Rotterdam,Marketing,3200,2,True,2022-06-15
# Bob,30,Amsterdam,Engineering,4000,5,True,2020-03-10
# Charlie,28,Utrecht,Sales,3500,3,False,2021-09-01
# Diana,22,The Hague,Marketing,2800,1,True,2023-01-20
# Ethan,35,Eindhoven,Engineering,4500,10,True,2015-11-05
# Fiona,27,Rotterdam,Sales,3600,4,False,2021-04-18
# George,40,Amsterdam,Management,6000,15,True,2010-07-30
# Hannah,24,Utrecht,Engineering,3100,2,True,2022-10-12
# Ivan,31,Eindhoven,Sales,4200,6,False,2019-02-25
# Julia,29,The Hague,Marketing,3700,5,True,2018-08-09

# We are grouping everyone in each existing department. So we have 4 groups:
# 1. Marketing
# 2. Engineering
# 3. Sales
# 4. Management

# Then we take the mean salary of the people in the individual departments
In [120]:
# Example 2: Count how many people are in each city

df.groupby("city")["name"].count()
Out[120]:
city
Amsterdam    2
Eindhoven    2
Rotterdam    2
The Hague    2
Utrecht      2
Name: name, dtype: int64
In [121]:
# Example 3: Average age per department

df.groupby("department")["age"].mean()
Out[121]:
department
Engineering    29.666667
Management     40.000000
Marketing      25.333333
Sales          28.666667
Name: age, dtype: float64
In [122]:
# Example 4: Multiple calculations at once

df.groupby("department")["salary"].agg(["mean", "min", "max"])
Out[122]:
mean min max
department
Engineering 3866.666667 3100 4500
Management 6000.000000 6000 6000
Marketing 3233.333333 2800 3700
Sales 3766.666667 3500 4200
In [123]:
# Example 5: Group by multiple columns

df.groupby(["department", "city"])["salary"].mean()
Out[123]:
department   city     
Engineering  Amsterdam    4000.0
             Eindhoven    4500.0
             Utrecht      3100.0
Management   Amsterdam    6000.0
Marketing    Rotterdam    3200.0
             The Hague    3250.0
Sales        Eindhoven    4200.0
             Rotterdam    3600.0
             Utrecht      3500.0
Name: salary, dtype: float64
In [ ]:
# Add a column to the dataframe
# We can add a new column to an existing dataframe by assigning to a new column name.
# We can also make use of existing columns to compute the new values:
# here the monthly salary times 12 gives the yearly salary.
df["yearly_salary"] = df["salary"] * 12
df
In [ ]:
# Notes:
# - groupby() groups rows that have the same value in a column
# - After grouping, you usually apply a function like:
#   mean(), count(), min(), max()

# - You can select a column after groupby:
#   df.groupby("column")["another_column"].function()

# - Useful functions:
#   mean()  → average
#   sum()   → total
#   count() → number of rows
#   min()   → smallest value
#   max()   → largest value
In [177]:
# Multi index
# You can create a multi index by passing a list of indices to your set_index function
df_multi=df.set_index(["department", "city", "salary"])
df_multi
Out[177]:
name age years_experience is_full_time start_date
department city salary
Marketing Rotterdam 3200 Alice 25 2 True 2022-06-15
Engineering Amsterdam 4000 Bob 30 5 True 2020-03-10
Sales Utrecht 3500 Charlie 28 3 False 2021-09-01
Marketing The Hague 2800 Diana 22 1 True 2023-01-20
Engineering Eindhoven 4500 Ethan 35 10 True 2015-11-05
Sales Rotterdam 3600 Fiona 27 4 False 2021-04-18
Management Amsterdam 6000 George 40 15 True 2010-07-30
Engineering Utrecht 3100 Hannah 24 2 True 2022-10-12
Sales Eindhoven 4200 Ivan 31 6 False 2019-02-25
Marketing The Hague 3700 Julia 29 5 True 2018-08-09
In [189]:
# -------------------------
# Using .loc with a multi index
# -------------------------

# When using a multi index, you select rows by passing a tuple:
# (level_1, level_2, ...)

df_multi.loc[("Marketing", "The Hague"), :]

# This means:
# - department = "Marketing"
# - city = "The Hague"
# - ":" → all remaining levels and all columns

# Since we did not specify "salary" (3rd index level),
# pandas will return all rows that match:
# department = Marketing AND city = The Hague
C:\Users\romar\AppData\Local\Temp\ipykernel_29212\2697545327.py:8: PerformanceWarning: indexing past lexsort depth may impact performance.
  df_multi.loc[("Marketing", "The Hague"), :]
Out[189]:
name age years_experience is_full_time start_date
salary
2800 Diana 22 1 True 2023-01-20
3700 Julia 29 5 True 2018-08-09
In [188]:
# -------------------------
# Selecting multiple values in a multi index
# -------------------------

# You can also pass a list inside the tuple

# Example:
df_multi.loc[("Marketing", ["The Hague", "Rotterdam"]), :]

# This means:
# - department = "Marketing"
# - city = either "The Hague" OR "Rotterdam"
Out[188]:
name age years_experience is_full_time start_date
department city salary
Marketing The Hague 2800 Diana 22 1 True 2023-01-20
3700 Julia 29 5 True 2018-08-09
Rotterdam 3200 Alice 25 2 True 2022-06-15
In [194]:
# -------------------------
# Selecting multiple values in a multi index and selecting specific columns
# -------------------------
df_multi.loc[("Marketing", ["The Hague", "Rotterdam"]), ["name","is_full_time"]]

# Explanation:

# ("Marketing", ["The Hague", "Rotterdam"]) → row selection
# - "Marketing" filters the first index level (department)
# - ["The Hague", "Rotterdam"] filters the second index level (city)
# - This returns rows where:
#   department = Marketing AND city is either The Hague or Rotterdam

# ["name"] → column selection
# - We only return the "name" column from the result

# Important:
# - The row part uses the multi index (department, city, salary)
# - The column part must use column names (like "name"), not numbers

# Result:
# A smaller table (DataFrame) with:
# - only selected rows
# - only the "name" column
Out[194]:
name is_full_time
department city salary
Marketing The Hague 2800 Diana True
3700 Julia True
Rotterdam 3200 Alice True
In [125]:
# -------------------------
# Series vs DataFrame
# -------------------------

# A Series and a DataFrame are both data structures in pandas,
# but they are used in slightly different situations.

# -------------------------
# Series
# -------------------------

# A Series is a single row OR a single column.

# Example:
df.loc[0]

# This returns one row as a Series.

# You can think of a Series as a 1D structure (a single line of data).
Out[125]:
name                     Alice
age                         25
city                 Rotterdam
department           Marketing
salary                    3200
years_experience             2
is_full_time              True
start_date          2022-06-15
Name: 0, dtype: object
In [127]:
# -------------------------
# DataFrame
# -------------------------

# A DataFrame is a table with rows and columns.

# Example:
df.loc[[0, 3]]

# This returns multiple rows as a DataFrame.

# You can think of a DataFrame as a 2D structure (a table).
Out[127]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
In [129]:
# -------------------------
# Simple rule
# -------------------------

# - One row → Series
# - Multiple rows → DataFrame

# Even a single row can be a DataFrame if you use:
df.loc[[0]]
Out[129]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
In [132]:
# You can verify by checking the type
print(type(df.loc[[0]]))
<class 'pandas.core.frame.DataFrame'>
In [135]:
# -------------------------
# Introduction to matplotlib
# -------------------------

# matplotlib is a library used to create visualizations (graphs)
# This helps to better understand and present data

# We usually use matplotlib together with pandas

import matplotlib.pyplot as plt
In [139]:
# -------------------------
# Basic plot
# -------------------------

# A basic plot shows values on a graph

# Example: Plot age values

df["age"].plot()

plt.show()

# What happens here:

# df["age"].plot()
# → pandas draws a line plot of the "age" column against the row index

# plt.show()
# → renders and displays the figure
In [140]:
# -------------------------
# Scatter plot (correlation)
# -------------------------

# A scatter plot is used to show the relationship between two variables
# (each point corresponds to one row of the DataFrame)

# Example: years of experience vs salary

plt.scatter(df["years_experience"], df["salary"])

# Label the axes and add a title so the figure can stand on its own
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Years of Experience")

plt.show()
In [146]:
# For the sake of reusability you can also assign variables to the data properties:
salary = df['salary']
years_experience = df['years_experience']

plt.scatter(years_experience, salary)
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Years of Experience")

# End the cell with plt.show() so the notebook displays the figure
# instead of echoing the last return value (a matplotlib Text object).
plt.show()
Out[146]:
Text(0.5, 1.0, 'Salary vs Years of Experience')
In [147]:
# -------------------------
# Correlation (np.corrcoef)
# -------------------------

# Correlation measures how strongly two variables are related

# We use numpy for this
import numpy as np
In [148]:
# Example: correlation between years_experience and salary

years_experience = df["years_experience"]
salary = df["salary"]

# Returns a 2x2 correlation matrix:
# the diagonal entries are 1.0 (each variable with itself),
# the off-diagonal entries are the correlation between the two variables
np.corrcoef(years_experience, salary)
Out[148]:
array([[1.        , 0.98217683],
       [0.98217683, 1.        ]])
In [149]:
# Extracting the actual correlation value:
# [0, 1] selects the off-diagonal entry of the 2x2 matrix
corr = np.corrcoef(years_experience, salary)[0, 1]
print(corr)
0.9821768330299293
In [ ]:
# Correlation values range from -1 to 1

#  1   → strong positive correlation (goes up together)
#  0   → no correlation
# -1   → strong negative correlation (one goes up, other goes down)

# In our case the correlation is very close to 1, meaning that the more years of experience an employee has, the higher the salary tends to be
In [150]:
# -------------------------
# Histogram
# -------------------------

# A histogram shows how values are distributed (spread out)

# Example: distribution of salaries
# (matplotlib uses 10 bins by default)

plt.hist(df["salary"])

plt.show()
In [151]:
# -------------------------
# Number of bins
# -------------------------

# Bins are the "bars" (value intervals) in the histogram

# More bins → more detail
# Fewer bins → simpler view

# Example with 5 bins:
plt.hist(df["salary"], bins=5)

plt.show()
In [152]:
# -------------------------
# Labels and title
# -------------------------

# Same 5-bin histogram as before, now with axis labels and a title
# so the figure can be understood on its own.
plt.hist(df["salary"], bins=5)

# The axes-object API is equivalent to plt.xlabel()/plt.ylabel()/plt.title()
ax = plt.gca()
ax.set_xlabel("Salary")
ax.set_ylabel("Number of Employees")
ax.set_title("Distribution of Salaries")

plt.show()
In [156]:
# -------------------------
# Tick values and labels
# -------------------------

# tick_val → positions on the axis
# tick_lab → labels drawn at those positions
tick_val = [2000, 3000, 4000, 5000, 6000]
tick_lab = ["2k", "3k", "4k", "5k", "6k"]

# -------------------------
# Apply custom ticks on x-axis
# -------------------------

plt.hist(df["salary"], bins=5)

plt.xlabel("Salary")
plt.ylabel("Number of Employees")
plt.title("Distribution of Salaries")

# Replace the automatically chosen tick positions/labels with our own
plt.xticks(tick_val, tick_lab)

plt.show()
In [237]:
# Summary statistics for the salary column
# (mean, sum, and median of a numeric Series)

salary_col = df['salary']
mean_salary = salary_col.mean()
sum_salary = salary_col.sum()
median = salary_col.median()

print(mean_salary)
print(sum_salary)
print(median)
3860.0
38600
3650.0
In [230]:
# Convert date
# In our data, start_date looks like a date but is actually stored as strings.
df = pd.read_csv("data.csv")

# Inspect the Python type of every value in the column — all str here
df["start_date"].apply(type).unique()
Out[230]:
array([<class 'str'>], dtype=object)
In [224]:
# NOTE(review): this same conversion is repeated in the next cell, so one of the
# two is redundant (re-running pd.to_datetime on a datetime column is harmless).
df["start_date"] = pd.to_datetime(df["start_date"])
In [225]:
# -------------------------
# Convert start_date to datetime
# -------------------------

# Convert the column from string → datetime
# (pd.to_datetime infers the format; here the values are ISO "YYYY-MM-DD")
df["start_date"] = pd.to_datetime(df["start_date"])

# Check the result
print(df["start_date"].dtype)

# After conversion, the dtype should be:
# datetime64[ns]

# This means pandas now treats the values as real dates,
# not as plain text (strings)
datetime64[ns]
In [232]:
# -------------------------
# Changing the datetime display format
# -------------------------

# Step 1: make sure the column holds real datetime values
df["start_date"] = pd.to_datetime(df["start_date"])

# Step 2: reformat each date as a day-month-year string
df["start_date"] = df["start_date"].dt.strftime("%d-%m-%Y")

# Notes:
#
# - pd.to_datetime() converts the column into a real datetime type
# - .dt.strftime() changes the DISPLAY format of the date
#
# Format codes:
#   %d → day (01–31)
#   %m → month (01–12)
#   %Y → full year (e.g. 2022)
#
# Important:
# after strftime the column holds plain strings again
# (it is no longer a datetime column)
#
# So:
# - keep the datetime type for calculations and filtering
# - use strftime only when you want to display dates in a specific format
df
Out[232]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 15-06-2022
1 Bob 30 Amsterdam Engineering 4000 5 True 10-03-2020
2 Charlie 28 Utrecht Sales 3500 3 False 01-09-2021
3 Diana 22 The Hague Marketing 2800 1 True 20-01-2023
4 Ethan 35 Eindhoven Engineering 4500 10 True 05-11-2015
5 Fiona 27 Rotterdam Sales 3600 4 False 18-04-2021
6 George 40 Amsterdam Management 6000 15 True 30-07-2010
7 Hannah 24 Utrecht Engineering 3100 2 True 12-10-2022
8 Ivan 31 Eindhoven Sales 4200 6 False 25-02-2019
9 Julia 29 The Hague Marketing 3700 5 True 09-08-2018
In [ ]:
# -------------------------
# Example: working with dates
# -------------------------

# BUG FIX: the previous cell used .dt.strftime(), which turned "start_date"
# back into plain strings, so the .dt accessors below would raise
# AttributeError. Convert the column back to datetime first.
# (If the column is already datetime, this conversion is a no-op.)
df["start_date"] = pd.to_datetime(df["start_date"], format="%d-%m-%Y")

# Extracting the year
df["start_year"] = df["start_date"].dt.year

# Extracting the month
df["start_month"] = df["start_date"].dt.month

# Extracting the day
df["start_day"] = df["start_date"].dt.day

df
# We should now have 3 more columns: start_year, start_month, start_day
In [233]:
# Create a csv
# This writes the DataFrame to a CSV file in the current directory.
# Note: by default the index is written as the first (unnamed) column;
# pass index=False if you do not want it in the file.
df.to_csv('dataframe.csv')
In [3]:
# -------------------------
# Calculating the average from a mixed list
# -------------------------

# This list contains both names (strings) and numbers (ages)
data = ['Anna', 21, 'John', 30, 'Maria', 25, 'David', 28, 'Lisa', 22]

# Running sum of the numeric items
total = 0

# How many numeric items we have seen
count = 0

# Loop through each item in the list
for item in data:

    # isinstance() is the idiomatic type check in Python.
    # bool is excluded explicitly because bool is a subclass of int,
    # so True/False would otherwise be counted as numbers.
    if isinstance(item, (int, float)) and not isinstance(item, bool):
        total += item
        count += 1

# Calculate the average, guarding against a list with no numbers
# (the original code would raise ZeroDivisionError for such input)
average = total / count if count else 0

print(average)

# What is happening step by step:

# 1. We go through each item in the list
# 2. We check: is this item a number?
# 3. If yes:
#    - add it to total
#    - increase count
# 4. After the loop:
#    - divide total by count → average

# The list contains strings AND numbers

# If we try:
# total += "Anna"
# → this would cause an error (TypeError)

# So we must check the type first
25.2
In [ ]: