In [8]:
# =========================
# Importing a CSV (PANDAS)
# =========================

# To work with CSV files in Python, we use the pandas library
import pandas as pd
# -------------------------
# Basic CSV import
# -------------------------

# The most common way to import a CSV is using:
df = pd.read_csv("data.csv")

# This reads the file and stores it in a DataFrame (df)
# A DataFrame is like a table (rows and columns)
In [9]:
# -------------------------
# Viewing the data
# -------------------------

# Show the first 5 rows
df.head()
Out[9]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
In [76]:
# Show basic information about the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              10 non-null     object
 1   age               10 non-null     int64 
 2   city              10 non-null     object
 3   department        10 non-null     object
 4   salary            10 non-null     int64 
 5   years_experience  10 non-null     int64 
 6   is_full_time      10 non-null     bool  
 7   start_date        10 non-null     object
dtypes: bool(1), int64(3), object(4)
memory usage: 698.0+ bytes
In [77]:
# Show summary statistics (only for numeric columns)
df.describe()
Out[77]:
age salary years_experience
count 10.0000 10.00000 10.000000
mean 29.1000 3860.00000 5.300000
std 5.3427 911.89668 4.270051
min 22.0000 2800.00000 1.000000
25% 25.5000 3275.00000 2.250000
50% 28.5000 3650.00000 4.500000
75% 30.7500 4150.00000 5.750000
max 40.0000 6000.00000 15.000000
In [78]:
# -------------------------
# Different separators
# -------------------------

# By default, pandas assumes the file is separated by commas (,)
# But sometimes other separators are used

# Example CSV with semicolon (common in Europe):
# name;age;city;salary
# Alice;25;Rotterdam;3200

# In that case, you must specify the separator, e.g. sep=";"
# Our data.csv is comma-separated, so here we pass the (default) sep="," explicitly:
df = pd.read_csv("data.csv", sep=",")
In [79]:
# -------------------------
# Common issues
# -------------------------

# Problem: Wrong separator
# If your columns look wrong (everything in one column),
# you probably need to change 'sep'


# Problem: Encoding errors
# Sometimes CSV files use different text encodings
df = pd.read_csv("data.csv", encoding="utf-8")
# or:
df = pd.read_csv("data.csv", encoding="latin-1")
In [80]:
# -------------------------
# Printing a column (Series)
# -------------------------

# You can access a column by using its name:
city = df["city"]
print(city)
print("====================================================================================")

# Output will look like:
# 0    Rotterdam
# 1    Amsterdam
# 2    Utrecht
# ...

# The numbers on the left (0, 1, 2, ...) are the index.

# What is the index?
# - It is a number automatically added by pandas
# - It shows the position of each row in the table
# - It is NOT part of the actual data (like city or name)

# Think of it like a row number in a table:
# Row 0 = first row
# Row 1 = second row
# etc.

# You can use the index to get a specific value:
print(city[0])  # Rotterdam
0    Rotterdam
1    Amsterdam
2      Utrecht
3    The Hague
4    Eindhoven
5    Rotterdam
6    Amsterdam
7      Utrecht
8    Eindhoven
9    The Hague
Name: city, dtype: object
====================================================================================
Rotterdam
In [81]:
# We can verify the type by printing the type
print(type(city))
<class 'pandas.core.series.Series'>
In [82]:
# -------------------------
# Setting a custom index
# -------------------------

# By default, pandas uses a numeric index (0, 1, 2, ...).
# But you can choose one of the columns to be the index instead.

# Example: using the "name" column as the index
df_indexed = df.set_index("name")

print(df_indexed)

# Now the index will be:
# Alice, Bob, Charlie, etc.
# instead of numbers like 0, 1, 2
         age       city   department  salary  years_experience  is_full_time  \
name                                                                           
Alice     25  Rotterdam    Marketing    3200                 2          True   
Bob       30  Amsterdam  Engineering    4000                 5          True   
Charlie   28    Utrecht        Sales    3500                 3         False   
Diana     22  The Hague    Marketing    2800                 1          True   
Ethan     35  Eindhoven  Engineering    4500                10          True   
Fiona     27  Rotterdam        Sales    3600                 4         False   
George    40  Amsterdam   Management    6000                15          True   
Hannah    24    Utrecht  Engineering    3100                 2          True   
Ivan      31  Eindhoven        Sales    4200                 6         False   
Julia     29  The Hague    Marketing    3700                 5          True   

         start_date  
name                 
Alice    2022-06-15  
Bob      2020-03-10  
Charlie  2021-09-01  
Diana    2023-01-20  
Ethan    2015-11-05  
Fiona    2021-04-18  
George   2010-07-30  
Hannah   2022-10-12  
Ivan     2019-02-25  
Julia    2018-08-09  
In [83]:
# -------------------------
# Why use a custom index?
# -------------------------

# Using a meaningful column as the index can make data easier to work with.

# For example:
# - Each person has a unique name in this dataset
# - So we can quickly look up a person by name

# Example:
df_indexed.loc["Alice"]

# This returns all information about Alice in one row.


# -------------------------
# When is this useful?
# -------------------------

# A custom index is useful when:
# - The column has unique values (no duplicates)
# - You want to quickly look up rows by a meaningful label
# - The index represents something "real" (like name, ID, date, etc.)

# In this dataset, "name" works well because:
# - Each row represents one person
# - Names are easy to understand and use for lookup
Out[83]:
age                         25
city                 Rotterdam
department           Marketing
salary                    3200
years_experience             2
is_full_time              True
start_date          2022-06-15
Name: Alice, dtype: object
In [84]:
# -------------------------
# We can further drill into Alice's information.
# -------------------------

# Let's say we want to find her age. It would be as simple as accessing her information as we just did and then specifying which column we want to display:
df_indexed.loc["Alice", "age"]

# This works because:
# - "Alice" is the index (row label)
# - "age" is the column name

# We can also access multiple columns for the same person:
df_indexed.loc["Alice", ["age", "city", "salary"]]

# This will return a small table with only the selected columns.
Out[84]:
age              25
city      Rotterdam
salary         3200
Name: Alice, dtype: object
In [85]:
# -------------------------
# Loc, column location
# -------------------------

# .loc is used to select data from a DataFrame using labels (names).


# The general format is:
# df.loc[row_label, column_label]

# In this example:
df_indexed.loc["Alice", "age"]

# "Alice"  -> row label (index value)
# "age"    -> column name
Out[85]:
25
In [86]:
# So .loc means:
# Go to the row with label 'Alice' and return the value in the 'age' column

# -------------------------
# Simple way to think about it
# -------------------------

# .loc[row, column]

# Row = which person (or entry) you want
# Column = which piece of information you want

# -------------------------
# Example
# -------------------------

# Get Alice's city:
df_indexed.loc["Alice", "city"]
Out[86]:
'Rotterdam'
In [87]:
# Get multiple columns for Alice:
df_indexed.loc["Alice", ["age", "city", "salary"]]
Out[87]:
age              25
city      Rotterdam
salary         3200
Name: Alice, dtype: object
In [88]:
# IMPORTANT: .loc uses labels (column names), not positions (numbers)

# This will NOT work because 0:2 are positions, not column names:
df_indexed.loc["Alice", 0:2]

# If you want to select columns by position, you should use .iloc instead
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_29212\2695698881.py in <module>
      2 
      3 # This will NOT work because 0:2 are positions, not column names:
----> 4 df_indexed.loc["Alice", 0:2]
      5 
      6 # If you want to select columns by position, you should use .iloc instead

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
    959             if self._is_scalar_access(key):
    960                 return self.obj._get_value(*key, takeable=self._takeable)
--> 961             return self._getitem_tuple(key)
    962         else:
    963             # we by definition only have the 0th axis

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
   1138         with suppress(IndexingError):
   1139             tup = self._expand_ellipsis(tup)
-> 1140             return self._getitem_lowerdim(tup)
   1141 
   1142         # no multi-index, so validate all of the indexers

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
    889                     return section
    890                 # This is an elided recursive call to iloc/loc
--> 891                 return getattr(section, self.name)[new_key]
    892 
    893         raise IndexingError("not applicable")

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
    965 
    966             maybe_callable = com.apply_if_callable(key, self.obj)
--> 967             return self._getitem_axis(maybe_callable, axis=axis)
    968 
    969     def _is_scalar_access(self, key: tuple):

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1181         if isinstance(key, slice):
   1182             self._validate_key(key, axis)
-> 1183             return self._get_slice_axis(key, axis=axis)
   1184         elif com.is_bool_indexer(key):
   1185             return self._getbool_axis(key, axis=axis)

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_slice_axis(self, slice_obj, axis)
   1215 
   1216         labels = obj._get_axis(axis)
-> 1217         indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
   1218 
   1219         if isinstance(indexer, slice):

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in slice_indexer(self, start, end, step, kind)
   6286         self._deprecated_arg(kind, "kind", "slice_indexer")
   6287 
-> 6288         start_slice, end_slice = self.slice_locs(start, end, step=step)
   6289 
   6290         # return a slice

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in slice_locs(self, start, end, step, kind)
   6496         start_slice = None
   6497         if start is not None:
-> 6498             start_slice = self.get_slice_bound(start, "left")
   6499         if start_slice is None:
   6500             start_slice = 0

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_slice_bound(self, label, side, kind)
   6405         # For datetime indices label may be a string that has to be converted
   6406         # to datetime boundary according to its resolution.
-> 6407         label = self._maybe_cast_slice_bound(label, side)
   6408 
   6409         # we need to look up the label

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _maybe_cast_slice_bound(self, label, side, kind)
   6352         # reject them, if index does not contain label
   6353         if (is_float(label) or is_integer(label)) and label not in self:
-> 6354             raise self._invalid_indexer("slice", label)
   6355 
   6356         return label

TypeError: cannot do slice indexing on Index with these indexers [0] of type int
In [89]:
# -------------------------
# iloc (index location)
# -------------------------

# .iloc is used to select data based on POSITION (numbers), not names.

# The general format is:
# df.iloc[row_position, column_position]

# -------------------------
# Example
# -------------------------

# Get Alice's row using position:
# This returns the first row in the dataset (Alice) 
# Keep in mind that df_indexed is indexed with Name as the index and Alice is the first name in the list
df_indexed.iloc[0]
Out[89]:
age                         25
city                 Rotterdam
department           Marketing
salary                    3200
years_experience             2
is_full_time              True
start_date          2022-06-15
Name: Alice, dtype: object
In [90]:
# Get a specific value using row and column positions:
df_indexed.iloc[0, 1]

# This means:
# - Row at position 0 (first row → Alice)
# - Column at position 1 (second column) this would be city in our dataset
Out[90]:
'Rotterdam'
In [91]:
# -------------------------
# Selecting a range of columns
# -------------------------
# Get the first row, and columns 0 to 2:
df_indexed.iloc[0, 0:3]

# Note:
# - 0:3 means columns at positions 0, 1, and 2
# - The end number (3) is NOT included

# -------------------------
# Simple rule
# -------------------------

# .loc  → uses names (labels)
# .iloc → uses positions (numbers)

# Use .iloc when you want to work with column/row positions
# instead of column names
In [92]:
# How to select multiple columns with .iloc
# You need to pass a list of column positions using []:
df_indexed.iloc[0, [0,3,5]]
Out[92]:
age               25
salary          3200
is_full_time    True
Name: Alice, dtype: object
In [93]:
# Similarly you can also select multiple rows 
df_indexed.iloc[0:2, [0,3,5]]
Out[93]:
age salary is_full_time
name
Alice 25 3200 True
Bob 30 4000 True
In [94]:
# or
df_indexed.iloc[:4, [0,3,5]]

# What does ":" mean in this context?

# ":" means "a range" or "from start to end".

# Example:
# 0:2 means:
# - start at index 0
# - go up to (but not including) index 2
# - rows at positions 0 and 1

# :4 means:
# - start from the beginning
# - go up to (but not including) index 4
# - rows at positions 0, 1, 2, 3

# So:
# df_indexed.iloc[:4, [0,3,5]]
# means:
# - take the first 4 rows
# - and only columns at positions 0, 3, and 5
Out[94]:
age salary is_full_time
name
Alice 25 3200 True
Bob 30 4000 True
Charlie 28 3500 False
Diana 22 2800 True
In [95]:
# Using negative indices (sometimes useful)
df_indexed.iloc[-1]

# This gets the last row in the dataset
Out[95]:
age                         29
city                 The Hague
department           Marketing
salary                    3700
years_experience             5
is_full_time              True
start_date          2018-08-09
Name: Julia, dtype: object
In [96]:
# In short: .iloc takes in 2 arguments — the row position and the column position:
# df.iloc[row_position, column_position]
In [97]:
# -------------------------
# Filtering data
# -------------------------

# Filtering means selecting rows based on a condition.

# The basic idea is:
# df[condition]

# The condition returns True or False for each row.
# Only rows with True are kept.

# Example 1: Filter people older than 30

df[df["age"] > 30]
Out[97]:
name age city department salary years_experience is_full_time start_date
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [98]:
# Example 1: Filter people older than 30

df[df["age"] > 30]
Out[98]:
name age city department salary years_experience is_full_time start_date
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [99]:
# Example 2: Filter by department

df[df["department"] == "Engineering"]
Out[99]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
In [100]:
# Example 3: Filter by multiple conditions
# Keep only the people who are older than 25 AND have a salary higher than 3500
df[(df["age"] > 25) & (df["salary"] > 3500)]
Out[100]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
In [101]:
# Example 4: Full-time employees only

df[df["is_full_time"] == True]
Out[101]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
In [102]:
# Example 5: Filter by city

df[df["city"] == "Amsterdam"]
Out[102]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [ ]:
# Notes:
# - Use () around each condition when combining them
# - Use:
#   &  for AND
#   |  for OR
# - Do NOT use "and" / "or" with pandas conditions
# - Always write conditions inside df[...]
In [103]:
# -------------------------
# Sorting data
# -------------------------

# You can sort a DataFrame using the sort_values() function.

# The basic format:
# df.sort_values(by="column_name")
# Example 1: Sort by age (ascending)

df.sort_values(by="age")
Out[103]:
name age city department salary years_experience is_full_time start_date
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [104]:
# Example 2: Sort by salary (highest first)

df.sort_values(by="salary", ascending=False)
Out[104]:
name age city department salary years_experience is_full_time start_date
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
In [105]:
# Example 3: Sort by multiple columns

df.sort_values(by=["department", "salary"])
Out[105]:
name age city department salary years_experience is_full_time start_date
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [106]:
# -------------------------
# Combining filtering and sorting
# -------------------------

# You can first filter data, and then sort the result.

# Example: Show Engineering employees sorted by salary (highest first)

df[df["department"] == "Engineering"].sort_values(by="salary", ascending=False)
Out[106]:
name age city department salary years_experience is_full_time start_date
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
In [108]:
# Example: Employees older than 25, sorted by age
# Note that we call a function on top of our initial filtering
df[df["age"] > 25].sort_values(by="age")
Out[108]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
In [110]:
# The code can be split into 2 steps.
# 1. First the filtering:
filtered_result = df[df["age"] > 25 ]
Out[110]:
name age city department salary years_experience is_full_time start_date
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [111]:
# 2. Then we sort
filtered_result.sort_values(by="age")
Out[111]:
name age city department salary years_experience is_full_time start_date
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [114]:
# sort_values is smart enough to also work with strings.
# Let's say we want to sort the rows alphabetically by city.
# In that case we can simply pass "city" to the function:
filtered_result.sort_values(by="city")
Out[114]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
9 Julia 29 The Hague Marketing 3700 5 True 2018-08-09
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
In [115]:
# -------------------------
# Using isin() for multiple values
# -------------------------

# isin() is useful when you want to filter multiple values from one column.

# Example: Filter specific cities
# Note we're passing a list to the isin function -> ["Amsterdam", "Rotterdam"]

df[df["city"].isin(["Amsterdam", "Rotterdam"])]
Out[115]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
6 George 40 Amsterdam Management 6000 15 True 2010-07-30
In [117]:
# Example: Filter multiple departments

df[df["department"].isin(["Engineering", "Sales"])]
Out[117]:
name age city department salary years_experience is_full_time start_date
1 Bob 30 Amsterdam Engineering 4000 5 True 2020-03-10
2 Charlie 28 Utrecht Sales 3500 3 False 2021-09-01
4 Ethan 35 Eindhoven Engineering 4500 10 True 2015-11-05
5 Fiona 27 Rotterdam Sales 3600 4 False 2021-04-18
7 Hannah 24 Utrecht Engineering 3100 2 True 2022-10-12
8 Ivan 31 Eindhoven Sales 4200 6 False 2019-02-25
In [118]:
# -------------------------
# Grouping data (groupby)
# -------------------------

# groupby() is used to group data based on a column,
# and then perform calculations on each group.

# Basic idea:
# - Split data into groups
# - Apply a calculation
# - Combine the result
In [119]:
# Example 1: Average salary per department

df.groupby("department")["salary"].mean()
Out[119]:
department
Engineering    3866.666667
Management     6000.000000
Marketing      3233.333333
Sales          3766.666667
Name: salary, dtype: float64
In [ ]:
# Let's dissect what happens here with our dataset
# The csv contains the following data:

# name,age,city,department,salary,years_experience,is_full_time,start_date
# Alice,25,Rotterdam,Marketing,3200,2,True,2022-06-15
# Bob,30,Amsterdam,Engineering,4000,5,True,2020-03-10
# Charlie,28,Utrecht,Sales,3500,3,False,2021-09-01
# Diana,22,The Hague,Marketing,2800,1,True,2023-01-20
# Ethan,35,Eindhoven,Engineering,4500,10,True,2015-11-05
# Fiona,27,Rotterdam,Sales,3600,4,False,2021-04-18
# George,40,Amsterdam,Management,6000,15,True,2010-07-30
# Hannah,24,Utrecht,Engineering,3100,2,True,2022-10-12
# Ivan,31,Eindhoven,Sales,4200,6,False,2019-02-25
# Julia,29,The Hague,Marketing,3700,5,True,2018-08-09

# We are grouping everyone in each existing department. So we have 4 groups:
# 1. Marketing
# 2. Engineering
# 3. Sales
# 4. Management

# Then we take the mean salary of the people in the individual departments
In [120]:
# Example 2: Count how many people are in each city

df.groupby("city")["name"].count()
Out[120]:
city
Amsterdam    2
Eindhoven    2
Rotterdam    2
The Hague    2
Utrecht      2
Name: name, dtype: int64
In [121]:
# Example 3: Average age per department

df.groupby("department")["age"].mean()
Out[121]:
department
Engineering    29.666667
Management     40.000000
Marketing      25.333333
Sales          28.666667
Name: age, dtype: float64
In [122]:
# Example 4: Multiple calculations at once

df.groupby("department")["salary"].agg(["mean", "min", "max"])
Out[122]:
mean min max
department
Engineering 3866.666667 3100 4500
Management 6000.000000 6000 6000
Marketing 3233.333333 2800 3700
Sales 3766.666667 3500 4200
In [123]:
# Example 5: Group by multiple columns

df.groupby(["department", "city"])["salary"].mean()
Out[123]:
department   city     
Engineering  Amsterdam    4000.0
             Eindhoven    4500.0
             Utrecht      3100.0
Management   Amsterdam    6000.0
Marketing    Rotterdam    3200.0
             The Hague    3250.0
Sales        Eindhoven    4200.0
             Rotterdam    3600.0
             Utrecht      3500.0
Name: salary, dtype: float64
In [ ]:
# Add a column to the dataframe
# We can add a new column to an existing dataframe by assigning to a new column name.
# We can also make use of existing columns to compute the new values:
# here the monthly salary times 12 gives the yearly salary.
df["yearly_salary"] = df["salary"] * 12
df
In [ ]:
# Notes:
# - groupby() groups rows that have the same value in a column
# - After grouping, you usually apply a function like:
#   mean(), count(), min(), max()

# - You can select a column after groupby:
#   df.groupby("column")["another_column"].function()

# - Useful functions:
#   mean()  → average
#   sum()   → total
#   count() → number of rows
#   min()   → smallest value
#   max()   → largest value
In [177]:
# Multi index
# You can create a multi index by passing a list of indices to your set_index function
df_multi=df.set_index(["department", "city", "salary"])
df_multi
Out[177]:
name age years_experience is_full_time start_date
department city salary
Marketing Rotterdam 3200 Alice 25 2 True 2022-06-15
Engineering Amsterdam 4000 Bob 30 5 True 2020-03-10
Sales Utrecht 3500 Charlie 28 3 False 2021-09-01
Marketing The Hague 2800 Diana 22 1 True 2023-01-20
Engineering Eindhoven 4500 Ethan 35 10 True 2015-11-05
Sales Rotterdam 3600 Fiona 27 4 False 2021-04-18
Management Amsterdam 6000 George 40 15 True 2010-07-30
Engineering Utrecht 3100 Hannah 24 2 True 2022-10-12
Sales Eindhoven 4200 Ivan 31 6 False 2019-02-25
Marketing The Hague 3700 Julia 29 5 True 2018-08-09
In [189]:
# -------------------------
# Using .loc with a multi index
# -------------------------

# When using a multi index, you select rows by passing a tuple:
# (level_1, level_2, ...)

df_multi.loc[("Marketing", "The Hague"), :]

# This means:
# - department = "Marketing"
# - city = "The Hague"
# - ":" → all remaining levels and all columns

# Since we did not specify "salary" (3rd index level),
# pandas will return all rows that match:
# department = Marketing AND city = The Hague
C:\Users\romar\AppData\Local\Temp\ipykernel_29212\2697545327.py:8: PerformanceWarning: indexing past lexsort depth may impact performance.
  df_multi.loc[("Marketing", "The Hague"), :]
Out[189]:
name age years_experience is_full_time start_date
salary
2800 Diana 22 1 True 2023-01-20
3700 Julia 29 5 True 2018-08-09
In [188]:
# -------------------------
# Selecting multiple values in a multi index
# -------------------------

# You can also pass a list inside the tuple

# Example:
df_multi.loc[("Marketing", ["The Hague", "Rotterdam"]), :]

# This means:
# - department = "Marketing"
# - city = either "The Hague" OR "Rotterdam"
Out[188]:
name age years_experience is_full_time start_date
department city salary
Marketing The Hague 2800 Diana 22 1 True 2023-01-20
3700 Julia 29 5 True 2018-08-09
Rotterdam 3200 Alice 25 2 True 2022-06-15
In [194]:
# -------------------------
# Selecting multiple values in a multi index and selecting specific columns
# -------------------------
df_multi.loc[("Marketing", ["The Hague", "Rotterdam"]), ["name","is_full_time"]]

# Explanation:

# ("Marketing", ["The Hague", "Rotterdam"]) → row selection
# - "Marketing" filters the first index level (department)
# - ["The Hague", "Rotterdam"] filters the second index level (city)
# - This returns rows where:
#   department = Marketing AND city is either The Hague or Rotterdam

# ["name"] → column selection
# - We only return the "name" column from the result

# Important:
# - The row part uses the multi index (department, city, salary)
# - The column part must use column names (like "name"), not numbers

# Result:
# A smaller table (DataFrame) with:
# - only selected rows
# - only the "name" column
Out[194]:
name is_full_time
department city salary
Marketing The Hague 2800 Diana True
3700 Julia True
Rotterdam 3200 Alice True
In [125]:
# -------------------------
# Series vs DataFrame
# -------------------------

# A Series and a DataFrame are both data structures in pandas,
# but they are used in slightly different situations.

# -------------------------
# Series
# -------------------------

# A Series is a single row OR a single column.

# Example:
df.loc[0]

# This returns one row as a Series.

# You can think of a Series as a 1D structure (a single line of data).
Out[125]:
name                     Alice
age                         25
city                 Rotterdam
department           Marketing
salary                    3200
years_experience             2
is_full_time              True
start_date          2022-06-15
Name: 0, dtype: object
In [127]:
# -------------------------
# DataFrame
# -------------------------

# A DataFrame is a table with rows and columns.

# Example:
df.loc[[0, 3]]

# This returns multiple rows as a DataFrame.

# You can think of a DataFrame as a 2D structure (a table).
Out[127]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
3 Diana 22 The Hague Marketing 2800 1 True 2023-01-20
In [129]:
# -------------------------
# Simple rule
# -------------------------

# - One row → Series
# - Multiple rows → DataFrame

# Even a single row can be a DataFrame if you use:
df.loc[[0]]
Out[129]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 2022-06-15
In [132]:
# You can verify by checking the type
print(type(df.loc[[0]]))
<class 'pandas.core.frame.DataFrame'>
In [135]:
# -------------------------
# Introduction to matplotlib
# -------------------------

# matplotlib is a library used to create visualizations (graphs)
# This helps to better understand and present data

# We usually use matplotlib together with pandas

import matplotlib.pyplot as plt
In [139]:
# -------------------------
# Basic plot
# -------------------------

# A basic plot shows values on a graph

# Example: Plot age values

df["age"].plot()

plt.show()

# What happens here:

# df["age"].plot()
# → pandas draws a line plot of the "age" column against the row index

# plt.show()
# → renders and displays the figure
In [140]:
# -------------------------
# Scatter plot (correlation)
# -------------------------

# A scatter plot is used to show the relationship between two variables
# (each point corresponds to one row of the DataFrame)

# Example: years of experience vs salary

plt.scatter(df["years_experience"], df["salary"])

# Label the axes and add a title so the figure can stand on its own
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Years of Experience")

plt.show()
In [146]:
# For the sake of reusability you can also assign variables to the data properties:
salary = df['salary']
years_experience = df['years_experience']

plt.scatter(years_experience, salary)
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Years of Experience")

# End the cell with plt.show() so the notebook displays the figure
# instead of echoing the last return value (a matplotlib Text object).
plt.show()
Out[146]:
Text(0.5, 1.0, 'Salary vs Years of Experience')
In [147]:
# -------------------------
# Correlation (np.corrcoef)
# -------------------------

# Correlation measures how strongly two variables are related

# We use numpy for this
import numpy as np
In [148]:
# Example: correlation between years_experience and salary

years_experience = df["years_experience"]
salary = df["salary"]

# Returns a 2x2 correlation matrix:
# the diagonal entries are 1.0 (each variable with itself),
# the off-diagonal entries are the correlation between the two variables
np.corrcoef(years_experience, salary)
Out[148]:
array([[1.        , 0.98217683],
       [0.98217683, 1.        ]])
In [149]:
# Extracting the actual correlation value:
# [0, 1] selects the off-diagonal entry of the 2x2 matrix
corr = np.corrcoef(years_experience, salary)[0, 1]
print(corr)
0.9821768330299293
In [ ]:
# Correlation values range from -1 to 1

#  1   → strong positive correlation (goes up together)
#  0   → no correlation
# -1   → strong negative correlation (one goes up, other goes down)

# In our case the correlation is very close to 1, meaning that the more years of experience an employee has, the higher the salary tends to be
In [150]:
# -------------------------
# Histogram
# -------------------------

# A histogram shows how values are distributed (spread out)

# Example: distribution of salaries
# (matplotlib uses 10 bins by default)

plt.hist(df["salary"])

plt.show()
In [151]:
# -------------------------
# Number of bins
# -------------------------

# Bins are the "bars" (value intervals) in the histogram

# More bins → more detail
# Fewer bins → simpler view

# Example with 5 bins:
plt.hist(df["salary"], bins=5)

plt.show()
In [152]:
# -------------------------
# Labels and title
# -------------------------

# Same 5-bin histogram as before, now with axis labels and a title
# so the figure can be understood on its own.
plt.hist(df["salary"], bins=5)

# The axes-object API is equivalent to plt.xlabel()/plt.ylabel()/plt.title()
ax = plt.gca()
ax.set_xlabel("Salary")
ax.set_ylabel("Number of Employees")
ax.set_title("Distribution of Salaries")

plt.show()
In [156]:
# -------------------------
# Tick values and labels
# -------------------------

# tick_val → positions on the axis
# tick_lab → labels drawn at those positions
tick_val = [2000, 3000, 4000, 5000, 6000]
tick_lab = ["2k", "3k", "4k", "5k", "6k"]

# -------------------------
# Apply custom ticks on x-axis
# -------------------------

plt.hist(df["salary"], bins=5)

plt.xlabel("Salary")
plt.ylabel("Number of Employees")
plt.title("Distribution of Salaries")

# Replace the automatically chosen tick positions/labels with our own
plt.xticks(tick_val, tick_lab)

plt.show()
In [237]:
# Summary statistics for the salary column
# (mean, sum, and median of a numeric Series)

salary_col = df['salary']
mean_salary = salary_col.mean()
sum_salary = salary_col.sum()
median = salary_col.median()

print(mean_salary)
print(sum_salary)
print(median)
3860.0
38600
3650.0
In [230]:
# Convert date
# In our data, start_date looks like a date but is actually stored as strings.
df = pd.read_csv("data.csv")

# Inspect the Python type of every value in the column — all str here
df["start_date"].apply(type).unique()
Out[230]:
array([<class 'str'>], dtype=object)
In [224]:
# NOTE(review): this same conversion is repeated in the next cell, so one of the
# two is redundant (re-running pd.to_datetime on a datetime column is harmless).
df["start_date"] = pd.to_datetime(df["start_date"])
In [225]:
# -------------------------
# Convert start_date to datetime
# -------------------------

# Convert the column from string → datetime
# (pd.to_datetime infers the format; here the values are ISO "YYYY-MM-DD")
df["start_date"] = pd.to_datetime(df["start_date"])

# Check the result
print(df["start_date"].dtype)

# After conversion, the dtype should be:
# datetime64[ns]

# This means pandas now treats the values as real dates,
# not as plain text (strings)
datetime64[ns]
In [232]:
# -------------------------
# Changing the datetime display format
# -------------------------

# Step 1: make sure the column holds real datetime values
df["start_date"] = pd.to_datetime(df["start_date"])

# Step 2: reformat each date as a day-month-year string
df["start_date"] = df["start_date"].dt.strftime("%d-%m-%Y")

# Notes:
#
# - pd.to_datetime() converts the column into a real datetime type
# - .dt.strftime() changes the DISPLAY format of the date
#
# Format codes:
#   %d → day (01–31)
#   %m → month (01–12)
#   %Y → full year (e.g. 2022)
#
# Important:
# after strftime the column holds plain strings again
# (it is no longer a datetime column)
#
# So:
# - keep the datetime type for calculations and filtering
# - use strftime only when you want to display dates in a specific format
df
Out[232]:
name age city department salary years_experience is_full_time start_date
0 Alice 25 Rotterdam Marketing 3200 2 True 15-06-2022
1 Bob 30 Amsterdam Engineering 4000 5 True 10-03-2020
2 Charlie 28 Utrecht Sales 3500 3 False 01-09-2021
3 Diana 22 The Hague Marketing 2800 1 True 20-01-2023
4 Ethan 35 Eindhoven Engineering 4500 10 True 05-11-2015
5 Fiona 27 Rotterdam Sales 3600 4 False 18-04-2021
6 George 40 Amsterdam Management 6000 15 True 30-07-2010
7 Hannah 24 Utrecht Engineering 3100 2 True 12-10-2022
8 Ivan 31 Eindhoven Sales 4200 6 False 25-02-2019
9 Julia 29 The Hague Marketing 3700 5 True 09-08-2018
In [ ]:
# -------------------------
# Example: working with dates
# -------------------------

# BUG FIX: the previous cell used .dt.strftime(), which turned "start_date"
# back into plain strings, so the .dt accessors below would raise
# AttributeError. Convert the column back to datetime first.
# (If the column is already datetime, this conversion is a no-op.)
df["start_date"] = pd.to_datetime(df["start_date"], format="%d-%m-%Y")

# Extracting the year
df["start_year"] = df["start_date"].dt.year

# Extracting the month
df["start_month"] = df["start_date"].dt.month

# Extracting the day
df["start_day"] = df["start_date"].dt.day

df
# We should now have 3 more columns: start_year, start_month, start_day
In [233]:
# Create a csv
# This writes the DataFrame to a CSV file in the current directory.
# Note: by default the index is written as the first (unnamed) column;
# pass index=False if you do not want it in the file.
df.to_csv('dataframe.csv')
In [3]:
# -------------------------
# Calculating the average from a mixed list
# -------------------------

# This list contains both names (strings) and numbers (ages)
data = ['Anna', 21, 'John', 30, 'Maria', 25, 'David', 28, 'Lisa', 22]

# Running sum of the numeric items
total = 0

# How many numeric items we have seen
count = 0

# Loop through each item in the list
for item in data:

    # isinstance() is the idiomatic type check in Python.
    # bool is excluded explicitly because bool is a subclass of int,
    # so True/False would otherwise be counted as numbers.
    if isinstance(item, (int, float)) and not isinstance(item, bool):
        total += item
        count += 1

# Calculate the average, guarding against a list with no numbers
# (the original code would raise ZeroDivisionError for such input)
average = total / count if count else 0

print(average)

# What is happening step by step:

# 1. We go through each item in the list
# 2. We check: is this item a number?
# 3. If yes:
#    - add it to total
#    - increase count
# 4. After the loop:
#    - divide total by count → average

# The list contains strings AND numbers

# If we try:
# total += "Anna"
# → this would cause an error (TypeError)

# So we must check the type first
25.2
In [ ]: