# =========================
# Importing a CSV (PANDAS)
# =========================
# To work with CSV files in Python, we use the pandas library
import pandas as pd
# -------------------------
# Basic CSV import
# -------------------------
# The most common way to import a CSV is using:
df = pd.read_csv("data.csv")  # parse the CSV file into a DataFrame
# This reads the file and stores it in a DataFrame (df)
# A DataFrame is like a table (rows and columns)
# -------------------------
# Viewing the data
# -------------------------
# Show the first 5 rows
df.head()
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 2023-01-20 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
# Show basic information about the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 10 non-null object 1 age 10 non-null int64 2 city 10 non-null object 3 department 10 non-null object 4 salary 10 non-null int64 5 years_experience 10 non-null int64 6 is_full_time 10 non-null bool 7 start_date 10 non-null object dtypes: bool(1), int64(3), object(4) memory usage: 698.0+ bytes
# Show summary statistics (only for numeric columns)
df.describe()
| age | salary | years_experience | |
|---|---|---|---|
| count | 10.0000 | 10.00000 | 10.000000 |
| mean | 29.1000 | 3860.00000 | 5.300000 |
| std | 5.3427 | 911.89668 | 4.270051 |
| min | 22.0000 | 2800.00000 | 1.000000 |
| 25% | 25.5000 | 3275.00000 | 2.250000 |
| 50% | 28.5000 | 3650.00000 | 4.500000 |
| 75% | 30.7500 | 4150.00000 | 5.750000 |
| max | 40.0000 | 6000.00000 | 15.000000 |
# -------------------------
# Different separators
# -------------------------
# By default, pandas assumes the file is separated by commas (,)
# But sometimes other separators are used
# Example CSV with semicolon (common in Europe):
# name;age;city;salary
# Alice;25;Rotterdam;3200
# For such a file you must specify the separator explicitly, e.g.:
#     df = pd.read_csv("data_semicolon.csv", sep=";")
# NOTE(review): our data.csv is comma-separated, so sep="," below is the
# same as the default — it is written out only to demonstrate the syntax.
df = pd.read_csv("data.csv", sep=",")
# -------------------------
# Common issues
# -------------------------
# Problem: Wrong separator
# If your columns look wrong (everything in one column),
# you probably need to change 'sep'
# Problem: Encoding errors
# Sometimes CSV files use different text encodings
df = pd.read_csv("data.csv", encoding="utf-8")
# or:
df = pd.read_csv("data.csv", encoding="latin-1")
# -------------------------
# Printing a column (Series)
# -------------------------
# You can access a column by using its name:
# Selecting a single column by its name returns a pandas Series
city = df["city"]  # one column of df, keeps the same row index
print(city)
# Visual separator between this output and the next one
print("====================================================================================")
# Output will look like:
# 0 Rotterdam
# 1 Amsterdam
# 2 Utrecht
# ...
# The numbers on the left (0, 1, 2, ...) are the index.
# What is the index?
# - It is a number automatically added by pandas
# - It shows the position of each row in the table
# - It is NOT part of the actual data (like city or name)
# Think of it like a row number in a table:
# Row 0 = first row
# Row 1 = second row
# etc.
# You can use the index to get a specific value:
print(city[0]) # Rotterdam
0 Rotterdam 1 Amsterdam 2 Utrecht 3 The Hague 4 Eindhoven 5 Rotterdam 6 Amsterdam 7 Utrecht 8 Eindhoven 9 The Hague Name: city, dtype: object ==================================================================================== Rotterdam
# We can verify the type by printing the type
print(type(city))
<class 'pandas.core.series.Series'>
# -------------------------
# Setting a custom index
# -------------------------
# By default, pandas uses a numeric index (0, 1, 2, ...).
# But you can choose one of the columns to be the index instead.
# Example: using the "name" column as the index
# Use the "name" column as the row index instead of 0, 1, 2, ...
# Note: set_index returns a NEW DataFrame; df itself is left unchanged.
df_indexed = df.set_index("name")
print(df_indexed)
# Now the index will be:
# Alice, Bob, Charlie, etc.
# instead of numbers like 0, 1, 2
age city department salary years_experience is_full_time \
name
Alice 25 Rotterdam Marketing 3200 2 True
Bob 30 Amsterdam Engineering 4000 5 True
Charlie 28 Utrecht Sales 3500 3 False
Diana 22 The Hague Marketing 2800 1 True
Ethan 35 Eindhoven Engineering 4500 10 True
Fiona 27 Rotterdam Sales 3600 4 False
George 40 Amsterdam Management 6000 15 True
Hannah 24 Utrecht Engineering 3100 2 True
Ivan 31 Eindhoven Sales 4200 6 False
Julia 29 The Hague Marketing 3700 5 True
start_date
name
Alice 2022-06-15
Bob 2020-03-10
Charlie 2021-09-01
Diana 2023-01-20
Ethan 2015-11-05
Fiona 2021-04-18
George 2010-07-30
Hannah 2022-10-12
Ivan 2019-02-25
Julia 2018-08-09
# -------------------------
# Why use a custom index?
# -------------------------
# Using a meaningful column as the index can make data easier to work with.
# For example:
# - Each person has a unique name in this dataset
# - So we can quickly look up a person by name
# Example:
df_indexed.loc["Alice"]
# This returns all information about Alice in one row.
# -------------------------
# When is this useful?
# -------------------------
# A custom index is useful when:
# - The column has unique values (no duplicates)
# - You want to quickly look up rows by a meaningful label
# - The index represents something "real" (like name, ID, date, etc.)
# In this dataset, "name" works well because:
# - Each row represents one person
# - Names are easy to understand and use for lookup
age 25 city Rotterdam department Marketing salary 3200 years_experience 2 is_full_time True start_date 2022-06-15 Name: Alice, dtype: object
# -------------------------
# We can further drill into Alice's information.
# -------------------------
# Let's say we want to find her age. It would be as simple as accessing her information as we just did and then specifying which column we want to display:
df_indexed.loc["Alice", "age"]
# This works because:
# - "Alice" is the index (row label)
# - "age" is the column name
# We can also access multiple columns for the same person:
df_indexed.loc["Alice", ["age", "city", "salary"]]
# This will return a small table with only the selected columns.
age 25 city Rotterdam salary 3200 Name: Alice, dtype: object
# -------------------------
# Loc, column location
# -------------------------
# .loc is used to select data from a DataFrame using labels (names).
# The general format is:
# df.loc[row_label, column_label]
# In this example:
df_indexed.loc["Alice", "age"]
# "Alice" -> row label (index value)
# "age" -> column name
25
# So .loc means:
# Go to the row with label 'Alice' and return the value in the 'age' column
# -------------------------
# Simple way to think about it
# -------------------------
# .loc[row, column]
# Row = which person (or entry) you want
# Column = which piece of information you want
# -------------------------
# Example
# -------------------------
# Get Alice's city:
df_indexed.loc["Alice", "city"]
'Rotterdam'
# Get multiple columns for Alice:
df_indexed.loc["Alice", ["age", "city", "salary"]]
age 25 city Rotterdam salary 3200 Name: Alice, dtype: object
# IMPORTANT: .loc uses labels (column names), not positions (numbers)
# This will NOT work because 0:2 are positions, not column names:
df_indexed.loc["Alice", 0:2]
# If you want to select columns by position, you should use .iloc instead
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_29212\2695698881.py in <module> 2 3 # This will NOT work because 0:2 are positions, not column names: ----> 4 df_indexed.loc["Alice", 0:2] 5 6 # If you want to select columns by position, you should use .iloc instead ~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key) 959 if self._is_scalar_access(key): 960 return self.obj._get_value(*key, takeable=self._takeable) --> 961 return self._getitem_tuple(key) 962 else: 963 # we by definition only have the 0th axis ~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup) 1138 with suppress(IndexingError): 1139 tup = self._expand_ellipsis(tup) -> 1140 return self._getitem_lowerdim(tup) 1141 1142 # no multi-index, so validate all of the indexers ~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup) 889 return section 890 # This is an elided recursive call to iloc/loc --> 891 return getattr(section, self.name)[new_key] 892 893 raise IndexingError("not applicable") ~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key) 965 966 maybe_callable = com.apply_if_callable(key, self.obj) --> 967 return self._getitem_axis(maybe_callable, axis=axis) 968 969 def _is_scalar_access(self, key: tuple): ~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis) 1181 if isinstance(key, slice): 1182 self._validate_key(key, axis) -> 1183 return self._get_slice_axis(key, axis=axis) 1184 elif com.is_bool_indexer(key): 1185 return self._getbool_axis(key, axis=axis) ~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_slice_axis(self, slice_obj, axis) 1215 1216 labels = obj._get_axis(axis) -> 1217 indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step) 1218 1219 if isinstance(indexer, slice): 
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in slice_indexer(self, start, end, step, kind) 6286 self._deprecated_arg(kind, "kind", "slice_indexer") 6287 -> 6288 start_slice, end_slice = self.slice_locs(start, end, step=step) 6289 6290 # return a slice ~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in slice_locs(self, start, end, step, kind) 6496 start_slice = None 6497 if start is not None: -> 6498 start_slice = self.get_slice_bound(start, "left") 6499 if start_slice is None: 6500 start_slice = 0 ~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_slice_bound(self, label, side, kind) 6405 # For datetime indices label may be a string that has to be converted 6406 # to datetime boundary according to its resolution. -> 6407 label = self._maybe_cast_slice_bound(label, side) 6408 6409 # we need to look up the label ~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _maybe_cast_slice_bound(self, label, side, kind) 6352 # reject them, if index does not contain label 6353 if (is_float(label) or is_integer(label)) and label not in self: -> 6354 raise self._invalid_indexer("slice", label) 6355 6356 return label TypeError: cannot do slice indexing on Index with these indexers [0] of type int
# -------------------------
# iloc (index location)
# -------------------------
# .iloc is used to select data based on POSITION (numbers), not names.
# The general format is:
# df.iloc[row_position, column_position]
# -------------------------
# Example
# -------------------------
# Get Alice's row using position:
# This returns the first row in the dataset (Alice)
# Keep in mind that df_indexed is indexed with Name as the index and Alice is the first name in the list
df_indexed.iloc[0]
age 25 city Rotterdam department Marketing salary 3200 years_experience 2 is_full_time True start_date 2022-06-15 Name: Alice, dtype: object
# Get a specific value using row and column positions:
df_indexed.iloc[0, 1]
# This means:
# - Row at position 0 (first row → Alice)
# - Column at position 1 (second column) this would be city in our dataset
'Rotterdam'
# -------------------------
# Selecting a range of columns
# -------------------------
# Example: df_indexed.iloc[0, 0:3] → the first row, and columns 0 to 2:
# Note:
# - 0:3 means columns at positions 0, 1, and 2
# - The end number (3) is NOT included
# -------------------------
# Simple rule
# -------------------------
# .loc → uses names (labels)
# .iloc → uses positions (numbers)
# Use .iloc when you want to work with column/row positions
# instead of column names
# How to select multiple columns with .iloc
# You need to pass a list of column positions using []:
df_indexed.iloc[0, [0,3,5]]
age 25 salary 3200 is_full_time True Name: Alice, dtype: object
# Similarly you can also select multiple rows
df_indexed.iloc[0:2, [0,3,5]]
| age | salary | is_full_time | |
|---|---|---|---|
| name | |||
| Alice | 25 | 3200 | True |
| Bob | 30 | 4000 | True |
# or
df_indexed.iloc[:4, [0,3,5]]
# What does ":" mean in this context?
# ":" means "a range" or "from start to end".
# Example:
# 0:2 means:
# - start at index 0
# - go up to (but not including) index 2
# - rows at positions 0 and 1
# :4 means:
# - start from the beginning
# - go up to (but not including) index 4
# - rows at positions 0, 1, 2, 3
# So:
# df_indexed.iloc[:4, [0,3,5]]
# means:
# - take the first 4 rows
# - and only columns at positions 0, 3, and 5
| age | salary | is_full_time | |
|---|---|---|---|
| name | |||
| Alice | 25 | 3200 | True |
| Bob | 30 | 4000 | True |
| Charlie | 28 | 3500 | False |
| Diana | 22 | 2800 | True |
# Using negative indices (sometimes useful)
df_indexed.iloc[-1]
# This gets the last row in the dataset
age 29 city The Hague department Marketing salary 3700 years_experience 5 is_full_time True start_date 2018-08-09 Name: Julia, dtype: object
# In short: .iloc takes 2 arguments, the row position and the column position:
# df.iloc[row_position, column_position]
# -------------------------
# Filtering data
# -------------------------
# Filtering means selecting rows based on a condition.
# The basic idea is:
# df[condition]
# The condition returns True or False for each row.
# Only rows with True are kept.
# Example 1: Filter people older than 30
df[df["age"] > 30]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
# (Same as Example 1 above, repeated for reference) Filter people older than 30
df[df["age"] > 30]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
# Example 2: Filter by department
df[df["department"] == "Engineering"]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
# Example 3: Filter by multiple conditions
# Keep everyone who is older than 25 AND has a salary higher than 3500
df[(df["age"] > 25) & (df["salary"] > 3500)]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
# Example 4: Full-time employees only
df[df["is_full_time"] == True]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 2023-01-20 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
# Example 5: Filter by city
df[df["city"] == "Amsterdam"]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
# Notes:
# - Use () around each condition when combining them
# - Use:
# & for AND
# | for OR
# - Do NOT use "and" / "or" with pandas conditions
# - Always write conditions inside df[...]
# -------------------------
# Sorting data
# -------------------------
# You can sort a DataFrame using the sort_values() function.
# The basic format:
# df.sort_values(by="column_name")
# Example 1: Sort by age (ascending)
df.sort_values(by="age")
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 2023-01-20 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
# Example 2: Sort by salary (highest first)
df.sort_values(by="salary", ascending=False)
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 2023-01-20 |
# Example 3: Sort by multiple columns
df.sort_values(by=["department", "salary"])
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 2023-01-20 |
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
# -------------------------
# Combining filtering and sorting
# -------------------------
# You can first filter data, and then sort the result.
# Example: Show Engineering employees sorted by salary (highest first)
df[df["department"] == "Engineering"].sort_values(by="salary", ascending=False)
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
# Example: Employees older than 25, sorted by age
# Note that we call a function on top of our initial filtering
df[df["age"] > 25].sort_values(by="age")
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
# The code can be split into 2 steps.
# 1. First the filtering:
filtered_result = df[df["age"] > 25 ]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
# 2. Then we sort
filtered_result.sort_values(by="age")
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
# sort_values is smart enough to also work with strings.
# Let's say we want to sort the rows alphabetically by city.
# In that case we can simply pass "city" to the function
filtered_result.sort_values(by="city")
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 2018-08-09 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
# -------------------------
# Using isin() for multiple values
# -------------------------
# isin() is useful when you want to filter multiple values from one column.
# Example: Filter specific cities
# Note we're passing a list to the isin function -> ["Amsterdam", "Rotterdam"]
df[df["city"].isin(["Amsterdam", "Rotterdam"])]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 2010-07-30 |
# Example: Filter multiple departments
df[df["department"].isin(["Engineering", "Sales"])]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 2020-03-10 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 2021-09-01 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 2015-11-05 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 2021-04-18 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 2022-10-12 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 2019-02-25 |
# -------------------------
# Grouping data (groupby)
# -------------------------
# groupby() is used to group data based on a column,
# and then perform calculations on each group.
# Basic idea:
# - Split data into groups
# - Apply a calculation
# - Combine the result
# Example 1: Average salary per department
df.groupby("department")["salary"].mean()
department Engineering 3866.666667 Management 6000.000000 Marketing 3233.333333 Sales 3766.666667 Name: salary, dtype: float64
# Let's dissect what happens here with our dataset
# The csv contains the following data:
# name,age,city,department,salary,years_experience,is_full_time,start_date
# Alice,25,Rotterdam,Marketing,3200,2,True,2022-06-15
# Bob,30,Amsterdam,Engineering,4000,5,True,2020-03-10
# Charlie,28,Utrecht,Sales,3500,3,False,2021-09-01
# Diana,22,The Hague,Marketing,2800,1,True,2023-01-20
# Ethan,35,Eindhoven,Engineering,4500,10,True,2015-11-05
# Fiona,27,Rotterdam,Sales,3600,4,False,2021-04-18
# George,40,Amsterdam,Management,6000,15,True,2010-07-30
# Hannah,24,Utrecht,Engineering,3100,2,True,2022-10-12
# Ivan,31,Eindhoven,Sales,4200,6,False,2019-02-25
# Julia,29,The Hague,Marketing,3700,5,True,2018-08-09
# We are grouping everyone by their department. So we have 4 groups:
# 1. Marketing
# 2. Engineering
# 3. Sales
# 4. Management
# Then we take the mean salary of the people in the individual departments
# Example 2: Count how many people are in each city
df.groupby("city")["name"].count()
city Amsterdam 2 Eindhoven 2 Rotterdam 2 The Hague 2 Utrecht 2 Name: name, dtype: int64
# Example 3: Average age per department
df.groupby("department")["age"].mean()
department Engineering 29.666667 Management 40.000000 Marketing 25.333333 Sales 28.666667 Name: age, dtype: float64
# Example 4: Multiple calculations at once
df.groupby("department")["salary"].agg(["mean", "min", "max"])
| mean | min | max | |
|---|---|---|---|
| department | |||
| Engineering | 3866.666667 | 3100 | 4500 |
| Management | 6000.000000 | 6000 | 6000 |
| Marketing | 3233.333333 | 2800 | 3700 |
| Sales | 3766.666667 | 3500 | 4200 |
# Example 5: Group by multiple columns
df.groupby(["department", "city"])["salary"].mean()
department city
Engineering Amsterdam 4000.0
Eindhoven 4500.0
Utrecht 3100.0
Management Amsterdam 6000.0
Marketing Rotterdam 3200.0
The Hague 3250.0
Sales Eindhoven 4200.0
Rotterdam 3600.0
Utrecht 3500.0
Name: salary, dtype: float64
# Add a column to the dataframe
# We can add a new column to an existing dataframe by assigning to a new column name.
# The right-hand side can be built from existing columns, as shown below.
# Derive a new "yearly_salary" column from the existing "salary" column.
# (Assumes "salary" is a monthly figure — TODO confirm with the data owner.)
df["yearly_salary"] = df["salary"] * 12
df
# Notes:
# - groupby() groups rows that have the same value in a column
# - After grouping, you usually apply a function like:
# mean(), count(), min(), max()
# - You can select a column after groupby:
# df.groupby("column")["another_column"].function()
# - Useful functions:
# mean() → average
# sum() → total
# count() → number of rows
# min() → smallest value
# max() → largest value
# Multi index
# You can create a multi index by passing a list of indices to your set_index function
# Build a hierarchical (MultiIndex) DataFrame: each row is now identified
# by the tuple (department, city, salary) instead of a single label.
df_multi=df.set_index(["department", "city", "salary"])
df_multi
| name | age | years_experience | is_full_time | start_date | |||
|---|---|---|---|---|---|---|---|
| department | city | salary | |||||
| Marketing | Rotterdam | 3200 | Alice | 25 | 2 | True | 2022-06-15 |
| Engineering | Amsterdam | 4000 | Bob | 30 | 5 | True | 2020-03-10 |
| Sales | Utrecht | 3500 | Charlie | 28 | 3 | False | 2021-09-01 |
| Marketing | The Hague | 2800 | Diana | 22 | 1 | True | 2023-01-20 |
| Engineering | Eindhoven | 4500 | Ethan | 35 | 10 | True | 2015-11-05 |
| Sales | Rotterdam | 3600 | Fiona | 27 | 4 | False | 2021-04-18 |
| Management | Amsterdam | 6000 | George | 40 | 15 | True | 2010-07-30 |
| Engineering | Utrecht | 3100 | Hannah | 24 | 2 | True | 2022-10-12 |
| Sales | Eindhoven | 4200 | Ivan | 31 | 6 | False | 2019-02-25 |
| Marketing | The Hague | 3700 | Julia | 29 | 5 | True | 2018-08-09 |
# -------------------------
# Using .loc with a multi index
# -------------------------
# When using a multi index, you select rows by passing a tuple:
# (level_1, level_2, ...)
df_multi.loc[("Marketing", "The Hague"), :]
# This means:
# - department = "Marketing"
# - city = "The Hague"
# - ":" → all remaining levels and all columns
# Since we did not specify "salary" (3rd index level),
# pandas will return all rows that match:
# department = Marketing AND city = The Hague
C:\Users\romar\AppData\Local\Temp\ipykernel_29212\2697545327.py:8: PerformanceWarning: indexing past lexsort depth may impact performance.
df_multi.loc[("Marketing", "The Hague"), :]
| name | age | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|
| salary | |||||
| 2800 | Diana | 22 | 1 | True | 2023-01-20 |
| 3700 | Julia | 29 | 5 | True | 2018-08-09 |
# -------------------------
# Selecting multiple values in a multi index
# -------------------------
# You can also pass a list inside the tuple
# Example:
df_multi.loc[("Marketing", ["The Hague", "Rotterdam"]), :]
# This means:
# - department = "Marketing"
# - city = either "The Hague" OR "Rotterdam"
| name | age | years_experience | is_full_time | start_date | |||
|---|---|---|---|---|---|---|---|
| department | city | salary | |||||
| Marketing | The Hague | 2800 | Diana | 22 | 1 | True | 2023-01-20 |
| 3700 | Julia | 29 | 5 | True | 2018-08-09 | ||
| Rotterdam | 3200 | Alice | 25 | 2 | True | 2022-06-15 |
# -------------------------
# Selecting multiple values in a multi index and filter out the colums
# -------------------------
df_multi.loc[("Marketing", ["The Hague", "Rotterdam"]), ["name","is_full_time"]]
# Explanation:
# ("Marketing", ["The Hague", "Rotterdam"]) → row selection
# - "Marketing" filters the first index level (department)
# - ["The Hague", "Rotterdam"] filters the second index level (city)
# - This returns rows where:
# department = Marketing AND city is either The Hague or Rotterdam
# ["name", "is_full_time"] → column selection
# - We only return the "name" and "is_full_time" columns from the result
# Important:
# - The row part uses the multi index (department, city, salary)
# - The column part must use column names (like "name"), not numbers
# Result:
# A smaller table (DataFrame) with:
# - only selected rows
# - only the "name" and "is_full_time" columns
| name | is_full_time | |||
|---|---|---|---|---|
| department | city | salary | ||
| Marketing | The Hague | 2800 | Diana | True |
| 3700 | Julia | True | ||
| Rotterdam | 3200 | Alice | True |
# -------------------------
# Series vs DataFrame
# -------------------------
# A Series and a DataFrame are both data structures in pandas,
# but they are used in slightly different situations.
# -------------------------
# Series
# -------------------------
# A Series is a single row OR a single column.
# Example:
df.loc[0]
# This returns one row as a Series.
# You can think of a Series as a 1D structure (a single line of data).
name Alice age 25 city Rotterdam department Marketing salary 3200 years_experience 2 is_full_time True start_date 2022-06-15 Name: 0, dtype: object
# -------------------------
# DataFrame
# -------------------------
# A DataFrame is a table with rows and columns.
# Example:
df.loc[[0, 3]]
# This returns multiple rows as a DataFrame.
# You can think of a DataFrame as a 2D structure (a table).
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 2023-01-20 |
# -------------------------
# Simple rule
# -------------------------
# - One row → Series
# - Multiple rows → DataFrame
# Even a single row can be a DataFrame if you use:
df.loc[[0]]
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 2022-06-15 |
# You can verify by checking the type
print(type(df.loc[[0]]))
<class 'pandas.core.frame.DataFrame'>
# -------------------------
# Introduction to matplotlib
# -------------------------
# matplotlib is a library used to create visualizations (graphs)
# This helps to better understand and present data
# We usually use matplotlib together with pandas
import matplotlib.pyplot as plt
# -------------------------
# Basic plot
# -------------------------
# A basic plot shows values on a graph
# Example: Plot age values
df["age"].plot()
plt.show()
# What happens here:
# df["age"].plot()
# → pandas creates a simple line plot using the "age" column
#   (the DataFrame index supplies the x-axis values)
# plt.show()
# → displays the plot
# -------------------------
# Scatter plot (correlation)
# -------------------------
# A scatter plot shows the relationship between two variables:
# each point is one row of the data.
# Example: years of experience vs salary
x_values = df["years_experience"]
y_values = df["salary"]
plt.scatter(x_values, y_values)
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Years of Experience")
plt.show()
# For sake of reusability you can also assign variables to the data properties:
salary = df['salary']
years_experience = df['years_experience']
plt.scatter(years_experience, salary)
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Years of Experience")
# FIX: plt.show() was missing here. Without it, a notebook echoes the
# return value of the last expression — the Text object from plt.title()
# — instead of just displaying the figure.
plt.show()
# Note: a notebook prints "Text(0.5, 1.0, 'Salary vs Years of Experience')"
# here — that is the return value of plt.title(). Calling plt.show() as the
# last line suppresses this echo and displays only the figure.
# -------------------------
# Correlation (np.corrcoef)
# -------------------------
# Correlation measures how strongly two variables are related
# We use numpy for this
import numpy as np
# Example: correlation between years_experience and salary
years_experience = df["years_experience"]
salary = df["salary"]
np.corrcoef(years_experience, salary)
array([[1.        , 0.98217683],
       [0.98217683, 1.        ]])
# corrcoef returns a 2x2 matrix: the diagonal entries are each variable's
# correlation with itself (always 1); the off-diagonal entries hold the
# correlation between the two variables — so [0, 1] picks the value we want.
# Extracting the actual correlation value
corr = np.corrcoef(years_experience, salary)[0, 1]
print(corr)
0.9821768330299293
# Correlation values range from -1 to 1
# 1 → strong positive correlation (goes up together)
# 0 → no correlation
# -1 → strong negative correlation (one goes up, other goes down)
# In our case the correlation is very close to 1 meaning that the more years of experience the employee has, the higher the salary
# -------------------------
# Histogram
# -------------------------
# A histogram shows how values are distributed (spread out)
# Example: distribution of salaries
# (when bins is not given, matplotlib chooses a default bin count)
plt.hist(df["salary"])
plt.show()
# -------------------------
# Number of bins
# -------------------------
# Bins are the "bars" in the histogram
# More bins → more detail
# Fewer bins → simpler view
# Example with 5 bins:
plt.hist(df["salary"], bins=5)
plt.show()
# -------------------------
# Labels and title
# -------------------------
plt.hist(df["salary"], bins=5)
plt.xlabel("Salary")
plt.ylabel("Number of Employees")
plt.title("Distribution of Salaries")
plt.show()
# -------------------------
# Tick values and labels
# -------------------------
# tick_val = positions on the axis
# tick_lab = labels shown at those positions
# Example:
tick_val = [2000, 3000, 4000, 5000, 6000]
tick_lab = ["2k", "3k", "4k", "5k", "6k"]
# -------------------------
# Apply custom ticks on x-axis
# -------------------------
plt.hist(df["salary"], bins=5)
plt.xlabel("Salary")
plt.ylabel("Number of Employees")
plt.title("Distribution of Salaries")
# xticks(positions, labels) replaces the automatic tick labels on the x-axis
plt.xticks(tick_val, tick_lab)
plt.show()
# Calculating mean, median, sum etc in a histogram
# Grab the column once and reuse it for each aggregate.
salary_column = df['salary']
mean_salary = salary_column.mean()
sum_salary = salary_column.sum()
median = salary_column.median()
print(mean_salary)
print(sum_salary)
print(median)
3860.0 38600 3650.0
# Convert date
# In our data the start_date shows a date but is actually of type string.
df=pd.read_csv("data.csv")
# Inspect the Python type of each value in the column — all str here
df["start_date"].apply(type).unique()
array([<class 'str'>], dtype=object)
df["start_date"] = pd.to_datetime(df["start_date"])
# -------------------------
# Convert start_date to datetime
# -------------------------
# Convert the column from string → datetime
# NOTE(review): this repeats the conversion done on the line just above;
# it is harmless, since converting an already-datetime column is a no-op.
df["start_date"] = pd.to_datetime(df["start_date"])
# Check the result
print(df["start_date"].dtype)
# After conversion, the dtype should be:
# datetime64[ns]
# This means pandas now treats the values as real dates,
# not as plain text (strings)
datetime64[ns]
# change date time format
# -------------------------
# Changing datetime format
# -------------------------
# First, convert the column to datetime
df["start_date"] = pd.to_datetime(df["start_date"])
# Then, change how the date is displayed using strftime
df['start_date'] = df['start_date'].dt.strftime('%d-%m-%Y')
# Notes:
# - pd.to_datetime() converts the column into a real datetime type
# - .dt.strftime() changes the DISPLAY format of the date
# Format codes:
# %d → day (01–31)
# %m → month (01–12)
# %Y → full year (e.g. 2022)
# Important:
# After using strftime, the column becomes a string again
# (not a datetime object anymore)
# WARNING: any later use of the .dt accessor on this column will fail
# with AttributeError until the column is re-converted with pd.to_datetime
# So:
# - Use datetime format for calculations and filtering
# - Use strftime only when you want to display dates in a specific format
df
| name | age | city | department | salary | years_experience | is_full_time | start_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Rotterdam | Marketing | 3200 | 2 | True | 15-06-2022 |
| 1 | Bob | 30 | Amsterdam | Engineering | 4000 | 5 | True | 10-03-2020 |
| 2 | Charlie | 28 | Utrecht | Sales | 3500 | 3 | False | 01-09-2021 |
| 3 | Diana | 22 | The Hague | Marketing | 2800 | 1 | True | 20-01-2023 |
| 4 | Ethan | 35 | Eindhoven | Engineering | 4500 | 10 | True | 05-11-2015 |
| 5 | Fiona | 27 | Rotterdam | Sales | 3600 | 4 | False | 18-04-2021 |
| 6 | George | 40 | Amsterdam | Management | 6000 | 15 | True | 30-07-2010 |
| 7 | Hannah | 24 | Utrecht | Engineering | 3100 | 2 | True | 12-10-2022 |
| 8 | Ivan | 31 | Eindhoven | Sales | 4200 | 6 | False | 25-02-2019 |
| 9 | Julia | 29 | The Hague | Marketing | 3700 | 5 | True | 09-08-2018 |
# -------------------------
# Example: working with dates
# -------------------------
# BUG FIX: the strftime call earlier turned start_date back into plain
# strings, so using the .dt accessor here would raise AttributeError.
# Re-convert to datetime first, extract the parts, then restore the
# dd-mm-YYYY display format.
df["start_date"] = pd.to_datetime(df["start_date"], format="%d-%m-%Y")
# Extracting year
df["start_year"] = df["start_date"].dt.year
# Extracting the month
df["start_month"] = df["start_date"].dt.month
# Extracting the day
df["start_day"] = df["start_date"].dt.day
# Restore the dd-mm-YYYY string format used for display above
df["start_date"] = df["start_date"].dt.strftime("%d-%m-%Y")
df
# We should now have 3 more columns. start_year, start_month,start_day
# Create a csv
# This will create a csv file (the index is written too; pass index=False to omit it)
df.to_csv('dataframe.csv')
# -------------------------
# Calculating the average from a mixed list
# -------------------------
# This list contains both names (strings) and numbers (ages)
data = ['Anna', 21, 'John', 30, 'Maria', 25, 'David', 28, 'Lisa', 22]
# Running total of the numeric values
total = 0
# How many numeric values we have seen
count = 0
# Loop through each item in the list
for item in data:
    # isinstance() is the idiomatic type check (instead of type(x) == int).
    # bool is excluded explicitly because bool is a subclass of int, and
    # True/False should not be counted as ages.
    if isinstance(item, (int, float)) and not isinstance(item, bool):
        total += item
        count += 1
# Calculate the average. count is non-zero here because the list contains
# numbers; guard with `if count:` when the input may contain none.
average = total / count
print(average)
# What is happening step by step:
# 1. We go through each item in the list
# 2. We check: is this item a number?
# 3. If yes:
# - add it to total
# - increase count
# 4. After the loop:
# - divide total by count → average
# The list contains strings AND numbers
# If we try:
# total += "Anna"
# → this would cause an error
# So we must check the type first
25.2