1. Python Fundamentals (Day 1)¶
All code will be shared on the GitHub page.
If you have questions, please ask or post them as a GitHub issue.
1.1 Setup Python¶
(a) Install from Anaconda Distribution
(b) Open anaconda prompt /terminal¶

Jupyter notebook --> run code (Ctrl + Enter) or (Alt+Enter)

1.2 Basic Python syntax and data types¶
| Syntax | Description |
|---|---|
| `print("Hello, World!")` | Print the string "Hello, World!" |
| `# This is a comment` | Single-line comment |
| `"""Multiline string"""` | Multiline string (used for documentation) |
| `x = 5` | Assign the value 5 to variable x |
| `if/for/while condition:` | Conditional or loop statement |
| `import module` | Import a Python module |
| `from module import x` | Import specific variable/function from a module |
Data type¶
| Data Type | Description | Example |
|---|---|---|
| int | Integer (whole number) | 42 |
| float | Floating-point (decimal) | 3.14 |
| str | String (text) | 'Hello, World!' |
| bool | Boolean (True/False) | True or False |
| list | List (ordered collection) | [1, 2, 3] |
| tuple | Tuple (immutable collection) | (1, 'apple', 3.14) |
| set | Set (unordered collection) | {1, 2, 3} |
| dict | Dictionary (key-value pairs) | {'name': 'John', 'age': 30} |
| NoneType | None (represents absence) | None |
Python Reserved Words¶
| Keyword | Keyword | Keyword | Keyword | Keyword | Keyword | Keyword |
|---|---|---|---|---|---|---|
| if/else/elif/for/while/return | def | pass/break/continue | is/as/not | in | import/from | try/yield |
print('This is my first python code')
This is my first python code
name= ' your name'
print(name)
your name
# Variables and Data Types:
name = "John" # name is a variable of type string
age = 30 # age is a variable of type integer
height = 6.2 # height is a variable of type float
is_student = False # is_student is a variable of type boolean
print(name)
print(age)
print(height)
print(is_student)
John 30 6.2 False
print(type(name))
<class 'str'>
# show the type of each variable
print(type(name))
print(type(age))
print(type(height))
print(type(is_student))
<class 'str'> <class 'int'> <class 'float'> <class 'bool'>
# print all variables in one line
print(name, age, height, is_student)
# or
print(name + " " + str(age) + " " + str(height) + " " + str(is_student))
# or f-string
print(f"{name} {age} {height} {is_student}")
John 30 6.2 False John 30 6.2 False John 30 6.2 False
print('your name','30')
your name 30
Operators¶
30*10
300
# Arithmetic Operators
x = 10
y = 5
addition_result = x + y
print("Addition:", addition_result) # Output: 15
subtraction_result = x - y
print("Subtraction:", subtraction_result) # Output: 5
multiplication_result = x * y
print("Multiplication:", multiplication_result) # Output: 50
division_result = x / y
print("Division:", division_result) # Output: 2.0
exponentiation_result = x ** y
print("Exponentiation:", exponentiation_result) # Output: 100000
Addition: 15 Subtraction: 5 Multiplication: 50 Division: 2.0 Exponentiation: 100000
# Arithmetic Operators
x = 10
y = 5
addition_result = x + y
print("Addition:", addition_result) # Output: 15
subtraction_result = x - y
print("Subtraction:", subtraction_result) # Output: 5
multiplication_result = x * y
print("Multiplication:", multiplication_result) # Output: 50
division_result = x / y
print("Division:", division_result) # Output: 2.0
exponentiation_result = x ** y
print("Exponentiation:", exponentiation_result) # Output: 100000
floor_division_result = x // y
print("Floor Division:", floor_division_result) # Output: 2
# Bitwise Operators
a = 5 # Binary: 101
b = 3 # Binary: 011
bitwise_and_result = a & b
print("Bitwise AND:", bitwise_and_result) # Output: 1 (Binary: 001)
bitwise_or_result = a | b
print("Bitwise OR:", bitwise_or_result) # Output: 7 (Binary: 111)
bitwise_xor_result = a ^ b
print("Bitwise XOR:", bitwise_xor_result) # Output: 6 (Binary: 110)
bitwise_left_shift_result = a << 1
print("Bitwise Left Shift:", bitwise_left_shift_result) # Output: 10 (Binary: 1010)
bitwise_right_shift_result = a >> 1
print("Bitwise Right Shift:", bitwise_right_shift_result) # Output: 2 (Binary: 10)
# Comparison Operators
x = 10
y = 5
less_than_or_equal = x <= y
print("Less than or equal to:", less_than_or_equal) # Output: False
greater_than = x > y
print("Greater than:", greater_than) # Output: True
greater_than_or_equal = x >= y
print("Greater than or equal to:", greater_than_or_equal) # Output: True
not_equal = x != y
print("Not equal to:", not_equal) # Output: True
equal_to = x == y
print("Equal to:", equal_to) # Output: False
# Assignment Expression (walrus operator): bind n and test it in one step
if (n := len("hello")) > 4:
    print(f"The length of 'hello' is {n}.") # Output: "The length of 'hello' is 5."
# Matrix Multiplication Operator
import numpy as np
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])
matrix_product = matrix_a @ matrix_b
print("Matrix Product:")
print(matrix_product)
Addition: 15 Subtraction: 5 Multiplication: 50 Division: 2.0 Exponentiation: 100000 Floor Division: 2 Bitwise AND: 1 Bitwise OR: 7 Bitwise XOR: 6 Bitwise Left Shift: 10 Bitwise Right Shift: 2 Less than or equal to: False Greater than: True Greater than or equal to: True Not equal to: True Equal to: False The length of 'hello' is 5. Matrix Product: [[19 22] [43 50]]
# list
fruits = ["apple", "banana", "cherry"]
print(fruits)
['apple', 'banana', 'cherry']
mix_list=['apple', 3, 'banana', 4, 'cherry', 5]
print(mix_list)
['apple', 3, 'banana', 4, 'cherry', 5]
# check data type
print(type(fruits))
<class 'list'>
2. Dictionaries:¶
#Dictionaries:
person = {"name": "Alice", "age": 25, "city": "New York"}
print(type(person))
print(person["name"]) # Accessing values
person["occupation"] = "Engineer" # Adding key-value pairs
<class 'dict'> Alice
person.keys() # Accessing keys
dict_keys(['name', 'age', 'city', 'occupation'])
person.values() # Accessing values
dict_values(['Alice', 25, 'New York', 'Engineer'])
Add function from built-in library or third-party library¶
# Add function from built-in library or third-party library
from math import sqrt
print(sqrt(25))
5.0
# list of numbers
numbers = [1, 4, 9]
# square root of each number
roots = [sqrt(n) for n in numbers]
print(roots)
[1.0, 2.0, 3.0]
Define your own function¶
# Define your own function
def my_square(n):
    """Return n raised to the power of 2."""
    return n ** 2

print(my_square(5))
25
# or
def greet(name):
    """Return a greeting of the form 'Hello, <name>!' for the given name."""
    return "Hello, " + name + "!"

result = greet("Bob")
print(result)
Hello, Bob!
Accessing items from list and Dictionary¶
Accessing, Indexing, Adding, and Deleting Items Cheatsheet¶
Lists¶
| Operation | Code Example | Description |
|---|---|---|
| Accessing an element by index | `my_list[index]` | Retrieves the element at the specified index in the list. |
| Slicing | `my_list[start:stop:step]` | Extracts a portion of the list based on start, stop, and step. |
| Accessing the last element | `my_list[-1]` | Accesses the last element of the list. |
| Adding an item at the end | `my_list.append(item)` | Appends an item to the end of the list. |
| Inserting an item at an index | `my_list.insert(index, item)` | Inserts an item at the specified index in the list. |
| Removing an item by value | `my_list.remove(item)` | Removes the first occurrence of the item with the given value. |
| Removing an item by index | `del my_list[index]` | Deletes the item at the specified index. |
| Checking if an item exists | `item in my_list` | Checks if an item exists in the list. |
| Finding the index of an item | `my_list.index(item)` | Returns the index of the first occurrence of the item. |
Dictionaries¶
| Operation | Code Example | Description |
|---|---|---|
| Accessing a value by key | `my_dict[key]` | Retrieves the value associated with the specified key. |
| Adding a key-value pair | `my_dict[key] = value` | Inserts a new key-value pair into the dictionary (or overwrites the value if the key already exists). |
| Removing a key-value pair | `del my_dict[key]` | Deletes the key-value pair with the specified key. |
| Checking if a key exists | `key in my_dict` | Checks if a key exists in the dictionary. |
| Getting all keys | `my_dict.keys()` | Returns a view of all keys in the dictionary. |
| Getting all values | `my_dict.values()` | Returns a view of all values in the dictionary. |
| Getting key-value pairs | `my_dict.items()` | Returns a view of the key-value pairs as tuples. |
# Creating a sample list
# indices start at 0: the value 1 is at index 0, 7 is at index 1, 3 is at index 2
my_list = [1, 7, 3, 4, 5]
# Accessing an element by index
my_list[1]
7
# Slicing
my_list[1:4]
[7, 3, 4]
# Accessing the last element
my_list[-1]
5
# Adding an item at the end
my_list.append(11)
my_list
[1, 7, 3, 4, 5, 11]
# Inserting an item at an index
my_list.insert(2,88) # 2 is the index, 88 is the value to insert
my_list
[1, 7, 88, 3, 4, 5, 11]
# Removing an item by value
my_list.remove(4)
my_list
[1, 7, 88, 3, 5, 11]
# Creating a sample dictionary
my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}
my_dict # keys and values
{'name': 'John', 'age': 30, 'city': 'New York'}
# print all keys
my_dict.keys()
dict_keys(['name', 'age', 'city'])
# print all values
my_dict.values()
dict_values(['John', 30, 'New York'])
# Accessing a value by key
my_dict['age']
30
# Adding a key-value pair
my_dict['job'] = 'Engineer'
my_dict
{'name': 'John', 'age': 30, 'city': 'New York', 'job': 'Engineer'}
# Removing a key-value pair
del my_dict['city']
my_dict
{'name': 'John', 'age': 30, 'job': 'Engineer'}
# merge two dictionaries
my_dict1 = {'name': 'John', 'age': 30}
my_dict2 = {'city': 'New York', 'job': 'Engineer'}
my_dict1.update(my_dict2)
my_dict1
{'name': 'John', 'age': 30, 'city': 'New York', 'job': 'Engineer'}
# merge two lists
my_list1 = [1, 2, 3]
my_list2 = [4, 5, 6]
my_list1.extend(my_list2)
my_list1
[1, 2, 3, 4, 5, 6]
# add two lists with different lengths and different data types
my_list1 = [1, 2, 3]
my_list2 = [4, 5, 6, 7]
# string
my_list3 = ['a', 'b', 'c']
my_list1 + my_list2 # numbers
my_list1 + my_list3 # numbers and string
[1, 2, 3, 'a', 'b', 'c']
my_list1 + my_list2 # numbers
[1, 2, 3, 4, 5, 6, 7]
my_list1 + my_list3
[1, 2, 3, 'a', 'b', 'c']
Control: if / for /while¶
# Conditional statements (if / elif / else): only the first branch whose test is true runs
x = 5
if x > 5:
    print("x is greater than 5")
elif x == 5:
    print("x is equal to 5")
else:
    print("x is not greater than 5")
x is equal to 5
# Example using a for loop: item takes each element of the list in turn
fruits = ["apple", "banana", "cherry"]
for item in fruits:
    print(item)
apple banana cherry
# alternative way in one line ; list comprehension : [expression for item in list]
[print(fruit) for fruit in fruits]
apple banana cherry
[None, None, None]
# Example using a for loop with range: range(1, 6) yields 1, 2, 3, 4, 5 (the stop value 6 is excluded)
for i in range(1, 6):
    print(i)
# alternative way in one line
[i for i in range(1, 6)]
1 2 3 4 5
[1, 2, 3, 4, 5]
# Example using a while loop
count = 0 # initialize count to 0
while count < 6: # condition: keep running until the condition is false
    print(count) # print count (0 through 5)
    count += 1 # increment count by 1
# count += 1 is equivalent to count = count + 1; the loop stops once count reaches 6,
# so the last value printed is 5
0 1 2 3 4 5
# Example using a while loop with break and continue
num = 0
while num < 10:
    num += 1
    if num == 5:
        continue # Skip the rest of this iteration when num is 5, so 5 is not printed
    print(num)
    if num == 8:
        break # Exit the loop when num is 8, so 9 and 10 are never printed
1 2 3 4 6 7 8
Date time¶
import datetime
datetime.datetime.now()
datetime.datetime(2024, 1, 8, 8, 29, 29, 628952)
# datetime
date='2020-01-01'
date = datetime.datetime.strptime(date, '%Y-%m-%d')
print(date)
print(type(date))
2020-01-01 00:00:00 <class 'datetime.datetime'>
# get month, day, year
print(date.month)
print(date.day)
print(date.year)
1 1 2020
Day 2 Advanced Python concepts¶
- Function Definition:
**def**: Keyword used to define a function. **function_name**: Name of the function; it follows the same rules as variable names. **(parameter1, parameter2, ...)**: Parameters that the function takes as input. These are optional.
- Docstring:
Triple-quoted string immediately below the function definition. Provides documentation for the function. Describes what the function does, explains the purpose of parameters, and specifies the return value. Not mandatory but considered good practice for code documentation.
- Parameters:
List of parameters inside the parentheses. Parameters are placeholders for values that the function expects to receive when called. Each parameter has a name and a type annotation (e.g., parameter1 (type)). Descriptions of each parameter explain what values they should represent.
- Function Body:
Contains the actual code that defines the functionality of the function. Can include statements, expressions, conditionals, loops, etc. This is where the main work of the function is done.
- Return Statement:
return: Keyword used to specify the result that the function should provide back to the caller. result_value: The value or object that the function returns. The return statement is optional. If omitted, the function returns None by default.
def add_numbers(x, y):
    """
    Adds two numbers and returns the result.

    Parameters:
    - x (int or float): The first number.
    - y (int or float): The second number.

    Returns:
    - int or float: The sum of x and y.
    """
    result = x + y
    return result
# call function
add_numbers(5, 10)
15
import math
def calculate_factorial(n):
    """
    Calculate the factorial of a given number.

    Delegates to math.factorial and returns its result.
    """
    return math.factorial(n)
print(calculate_factorial(5))
120
def factorial(n):
    """
    Calculates the factorial of a number using recursion.

    Parameters:
    - n (int): A non-negative integer.

    Returns:
    - int: n! (1 for n equal to 0 or 1).

    Raises:
    - ValueError: If n is negative; without this check the recursion
      would never reach the base case.
    """
    if n < 0:
        raise ValueError("factorial is not defined for negative numbers")
    if n == 0 or n == 1:
        return 1
    else:
        return n * factorial(n-1)
# Call the function
factorial_result = factorial(5)
print(f"The factorial is: {factorial_result}")
The factorial is: 120
def add_numbers(a, b):
    """Adds two numbers and returns their sum."""
    return a + b
result = add_numbers(3, 5)
print(result)
add = lambda x, y: x + y
result = add(3, 5)
print(result)
8
def calculator(operation):
    """
    Returns the arithmetic function selected by name.

    Parameters:
    - operation (str): "add" or "subtract".

    Returns:
    - function or None: A two-argument function performing the requested
      operation, or None when the name is not recognised.
    """
    def add(x, y):
        return x + y

    def subtract(x, y):
        return x - y

    if operation == "add":
        return add
    elif operation == "subtract":
        return subtract
    else:
        return None
add_function = calculator("add")
result_add = add_function(3, 4)
subtract_function = calculator("subtract")
result_subtract = subtract_function(7, 2)
print(result_add) # Output: 7
print(result_subtract) # Output: 5
7 5
def read_and_print_file(file_path):
    """
    Reads and prints lines from a text file.

    Each line is printed with leading and trailing whitespace removed.
    If the file does not exist, a "File not found" message is printed
    instead of raising.

    Parameters:
    - file_path (str): Path of the text file to read.
    """
    try:
        # The with-statement closes the file even if printing fails
        with open(file_path, 'r') as file:
            for line in file:
                print(line.strip())
    except FileNotFoundError:
        print(f"File not found: {file_path}")
# Test the function
read_and_print_file('example.txt')
# Assumes 'example.txt' contains some lines of text
File not found: example.txt
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
mean_value = np.mean(arr)
print(mean_value)
3.0
#mymodule.py:
def greet(name):
    """Prints a greeting of the form 'Hello, <name>!'."""
    print(f"Hello, {name}!")
def square(x):
    """Returns the square of a number."""
    return x ** 2
# import mymodule
# mymodule.greet("Alice") # Output: Hello, Alice!
# result = mymodule.square(4)
# print(result) # Output: 16
Some special functions and built-in functions in Python, including map, zip, filter, and reduce¶
map, zip and filter are built-in functions, while reduce is imported from the functools module; together they provide powerful tools for working with sequences and collections.
Key Differences:¶
Output Type:¶
map(), filter(): Return lazy iterators (wrap them in list() to see the results). zip(): Returns an iterator of tuples. reduce(): Returns a single value.
Functionality:¶
map(): Applies a function to each element independently. zip(): Combines elements from multiple iterables. filter(): Selectively includes elements based on a condition. reduce(): Aggregates values using a binary function cumulatively.
Use Cases:¶
- Use `map()` when you want to transform each element in an iterable.
- Use `zip()` when you want to combine elements from different iterables.
- Use `filter()` when you want to selectively include elements based on a condition.
numbers = [1, 2, 3, 4, 5]
# use of map without lambda
def square(x):
    """Return x squared; used below as the function passed to map()."""
    return x**2
squared = map(square, numbers) # map(function, iterable)
print(list(squared))
[1, 4, 9, 16, 25]
# Using map to square a list of numbers
numbers = [1, 2, 3, 4, 5]
squared = map(lambda x: x**2, numbers) # map(function, iterable)
print(list(squared))
names = ["Alice", "Bob", "Charlie"]
scores = [95, 87, 92]
zipped_data = zip(names, scores) # zip(iterable1, iterable2)
zipped_list = list(zipped_data)
print(zipped_list) #
[('Alice', 95), ('Bob', 87), ('Charlie', 92)]
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Filter even numbers
even_numbers = filter(lambda x: x % 2 == 0, numbers) # filter(function, iterable)
even_list = list(even_numbers)
print(even_list)
[2, 4, 6, 8]
from functools import reduce
numbers = [1, 2, 3, 4, 5]
product = reduce(lambda x, y: x * y, numbers) # reduce(function, iterable)
print(product)
120
| Regular Expression Pattern | Description |
| --------------------------- | ------------------------------------------------ |
| . | Matches any character except a newline |
| ^ | Matches the start of a string |
| $ | Matches the end of a string |
| * | Matches 0 or more occurrences of the preceding character |
| + | Matches 1 or more occurrences of the preceding character |
| ? | Matches 0 or 1 occurrence of the preceding character |
| \d | Matches any digit (0-9) |
| \D | Matches any non-digit character |
| \w | Matches any word character (alphanumeric or underscore) |
| \W | Matches any non-word character |
| \s | Matches any whitespace character |
| \S | Matches any non-whitespace character |
| [abc] | Matches any character a, b, or c |
| [^abc] | Matches any character except a, b, or c |
| [a-z] | Matches any lowercase letter |
| [A-Z] | Matches any uppercase letter |
| [0-9] | Matches any digit (0-9) |
| (abc) | Groups and captures the enclosed pattern |
| \|                          | Acts as a logical OR for patterns                 |
| \b | Matches a word boundary |
| \B | Matches a non-word boundary |
# simple example of regular expression
import re

pattern = r"Cookie"
sequence = "Cookie"
# re.match only succeeds when the pattern matches at the start of the string
if re.match(pattern, sequence):
    print("Match!")
Match!
# example of regular expression: split the text on whitespace characters
txt = "The rain in starkville is good"
# Use a raw string: "\s" in a normal string literal is an invalid escape sequence
x = re.split(r"\s", txt)
print(x)
['The', 'rain', 'in', 'starkville', 'is', 'good']
import re
# Sample text containing phone numbers
text = "Please contact us at 123-456-7890 or 555-555-5555 for assistance."
# Define a regular expression pattern for matching phone numbers
pattern = r'\d{3}-\d{3}-\d{4}'
# Use the findall method to find all matches in the text
phone_numbers = re.findall(pattern, text)
print(phone_numbers)
['123-456-7890', '555-555-5555']
# Sample text containing email addresses
text = "Please contact support@example.com for assistance or john.doe@email.co for more information."
# Define a regular expression pattern for matching email addresses.
# The top-level-domain class is [A-Za-z]: inside [...] a "|" is a literal pipe
# character, not alternation, so it must not appear there.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
# Use the findall method to find all matches in the text
email_addresses = re.findall(pattern, text)
# Print the extracted email addresses
for email in email_addresses:
    print(email)
support@example.com john.doe@email.co
2. Numpy and Data Analysis with Pandas (Week 2)¶
Numpy¶
| NumPy Function | Description |
|---|---|
| `np.array([1, 2, 3])` | Create a 1D array |
| `np.zeros((2, 3))` | Create a 2D array filled with zeros |
| `np.ones((2, 3))` | Create a 2D array filled with ones |
| `np.arange(0, 10, 2)` | Create an array with a range of values |
| `np.linspace(0, 1, 5)` | Create an array with evenly spaced values |
| `np.eye(3)` | Create a 3x3 identity matrix |
| `np.random.rand(2, 2)` | Create a 2x2 array with random values in [0, 1) |
| `np.sum(arr)` | Sum of all elements in the array |
| `np.mean(arr)` | Mean of all elements in the array |
| `np.max(arr)` | Maximum value in the array |
| `np.min(arr)` | Minimum value in the array |
| `np.argmax(arr)` | Index of the maximum value |
| `np.argmin(arr)` | Index of the minimum value |
| `np.reshape(arr, (2, 3))` | Reshape the array |
| `np.transpose(arr)` | Transpose the array |
| `np.dot(arr1, arr2)` | Dot product of two arrays |
| `np.concatenate((a, b), axis=0)` | Concatenate arrays vertically (axis=0) |
| `np.vstack((a, b))` | Stack arrays vertically |
| `np.hstack((a, b))` | Stack arrays horizontally |
# import built-in libraries
import math
import random
# importing pandas library,numpy library: Third-party libraries
import pandas as pd
import numpy as np
# add numpy library as np
import numpy as np
# Create a 1D array from a Python list
arr_1d = np.array([1, 2, 3, 4, 5])
print(arr_1d)
[1 2 3 4 5]
# shape of array
arr_1d.shape
(5,)
# Create a 2D array (matrix)
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(matrix)
# shape of matrix with fstring
print(f"Shape: {matrix.shape}, Rows: {matrix.shape[0]}, Columns: {matrix.shape[1]}")
[[1 2 3] [4 5 6] [7 8 9]] Shape: (3, 3), Rows: 3, Columns: 3
# Create an array of zeros
zeros = np.zeros((2, 3)) # Creates a 2x3 array of zeros
# Create an array of ones
ones = np.ones((3, 2)) # Creates a 3x2 array of ones
# Create an identity matrix
identity = np.eye(3) # Creates a 3x3 identity matrix
print(f'Zeros:\n{zeros}\n')
print(f'Ones:\n{ones}\n')
print(f'Identity Matrix:\n{identity}\n')
Zeros: [[0. 0. 0.] [0. 0. 0.]] Ones: [[1. 1.] [1. 1.] [1. 1.]] Identity Matrix: [[1. 0. 0.] [0. 1. 0.] [0. 0. 1.]]
Array Operations via numpy¶
# Element-wise addition
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
result = a + b
print(result)
[5 7 9]
# Matrix multiplication
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])
result_matrix = np.dot(matrix_a, matrix_b)
print(result_matrix)
[[19 22] [43 50]]
# Indexing
arr = np.array([10, 20, 30, 40, 50])
arr[2] # Accessing the element at index
30
# Slicing
arr[1:4] # Getting elements from index 1 to 3 (exclusive)
array([20, 30, 40])
# mean, median, standard deviation
arr = np.array([1, 2, 3, 4, 5])
print(f'Mean: {np.mean(arr)}')
print(f'Median: {np.median(arr)}')
print(f'Standard Deviation: {np.std(arr)}')
Mean: 3.0 Median: 3.0 Standard Deviation: 1.4142135623730951
# reshape
arr = np.array([1, 2, 3, 4, 5, 6])
new_arr = arr.reshape(2, 3)
print(new_arr)
[[1 2 3] [4 5 6]]
Pandas [Data analysis library]¶
Data reading, exploration and writing¶
| Command | Description |
|---|---|
| `pd.read_csv(data)` | Read file |
| `pd.Series(data)` | Create a Series |
| `df.info()` | Get DataFrame info |
| `df.describe()` | Summary statistics |
| `pd.DataFrame(data)` | Create a DataFrame |
| `df['column_name']` | Select a single column |
| `df[['col1', 'col2']]` | Select multiple columns |
| `df.loc[row_label]` | Select a row by label |
| `df.iloc[row_index]` | Select a row by index |
| `df.head(n)` | Display the first n rows |
| `df.tail(n)` | Display the last n rows |
| `df.shape` | Get the shape (rows, columns) |
| `df.to_csv()` | Save data |
Data Manipulation¶
| Command | Description |
|---|---|
| `df['column_name']` | Select a single column |
| `df[['col1', 'col2']]` | Select multiple columns |
| `df.loc[row_label]` | Select a row by label |
| `df.iloc[row_index]` | Select a row by index |
| `df.drop(labels, axis=0)` | Drop rows/columns |
| `df.groupby('column').mean()` | Group by and aggregate |
| `df.sort_values('column')` | Sort by column |
| `df.pivot_table()` | Create a pivot table |
Data complex Filtering¶
| Command | Description |
|---|---|
| `df[df['col'] > value]` | Boolean indexing |
| `df.query('expression')` | Query by expression |
| `df[(df['col1'] > val1) & (df['col2'] < val2)]` | Multiple conditions |
Data Cleaning¶
| Command | Description |
|---|---|
| `df.rename(columns={'old': 'new'})` | Rename columns |
| `df.drop_duplicates()` | Remove duplicates |
| `df.set_index('column_name')` | Set column as index |
| `df.reset_index()` | Reset index |
| `df.fillna(value)` | Fill missing values |
# create pandas dataframe
# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'Age': [25, 30, 35, 40, 45],
'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
'Salary': [50000, 60000, 75000, 80000, 55000]}
df = pd.DataFrame(data)
df
| Name | Age | Gender | Salary | |
|---|---|---|---|---|
| 0 | Alice | 25 | Female | 50000 |
| 1 | Bob | 30 | Male | 60000 |
| 2 | Charlie | 35 | Male | 75000 |
| 3 | David | 40 | Male | 80000 |
| 4 | Eva | 45 | Female | 55000 |
# set up working directory
import os # os is a built-in library which provides functions for interacting with the operating system
os.getcwd() # get current working directory
os.chdir('E:/my works/prensentation/data') # change working directory
# read csv data
import pandas as pd # import pandas library as pd
df = pd.read_csv('iris.csv') # read csv file
# check data type
df.dtypes
Sepal.Length float64 Sepal.Width float64 Petal.Length float64 Petal.Width float64 Species object dtype: object
# view first 5 rows
df.head(5)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Descriptive Statistics
df.describe()
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 149.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.773154 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.793104 | 0.762238 |
| min | 4.300000 | 2.000000 | 0.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.500000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.400000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
Cleaning¶
# check missing values
df.isnull().sum()
Sepal.Length 0 Sepal.Width 0 Petal.Length 1 Petal.Width 0 Species 0 dtype: int64
# fill missing values in the numeric columns with the column mean
# numeric_only=True restricts the mean to numeric columns, skipping the text
# column 'Species' and avoiding the pandas FutureWarning about numeric_only
df.fillna(df.mean(numeric_only=True), inplace=True)
C:\Users\Hafez\AppData\Local\Temp\ipykernel_2188\2806775267.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. df.fillna(df.mean(), inplace=True)
# check missing values again
df.isnull().sum()
Sepal.Length 0 Sepal.Width 0 Petal.Length 0 Petal.Width 0 Species 0 dtype: int64
# any duplicate rows?
df.duplicated().sum()
1
# drop duplicate rows
df.drop_duplicates(inplace=True)
# save cleaned data
df.to_csv('iris_cleaned.csv', index=False) # index=False means do not save index
# apply function to a column
# The iris columns are named 'Sepal.Length', 'Sepal.Width', ... (see df.dtypes);
# a 'sepal_length' column does not exist and indexing it raises KeyError.
# NOTE(review): this doubles 'Sepal.Length' in place and adds a 'Total' column;
# presumably the outputs shown further down were produced without running this
# cell, so confirm before relying on them.
df['Sepal.Length'] = df['Sepal.Length'].apply(lambda x: x * 2)
# Make
# Define a custom function to calculate the total
def calculate_total(row):
    """Return the row's 'Sepal.Length' value multiplied by itself."""
    return row['Sepal.Length'] * row['Sepal.Length']
# Apply the function to each row (axis=1 passes one row at a time)
df['Total'] = df.apply(calculate_total, axis=1)
# calculate correlation between only numeric columns
df.select_dtypes(include=['float64', 'int64']).corr()
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
|---|---|---|---|---|
| Sepal.Length | 1.000000 | -0.117570 | 0.856971 | 0.817941 |
| Sepal.Width | -0.117570 | 1.000000 | -0.412193 | -0.366126 |
| Petal.Length | 0.856971 | -0.412193 | 1.000000 | 0.939200 |
| Petal.Width | 0.817941 | -0.366126 | 0.939200 | 1.000000 |
# make pivot table
df.pivot_table(index='Species', values='Sepal.Length', aggfunc='mean')
| Sepal.Length | |
|---|---|
| Species | |
| Setosa | 5.100000 |
| seTosa | 5.000000 |
| setosa | 5.004167 |
| versicolor | 5.936000 |
| virginica | 6.588000 |
# group by
df.groupby('Species')['Sepal.Length'].mean()
Species Setosa 5.100000 seTosa 5.000000 setosa 5.004167 versicolor 5.936000 virginica 6.588000 Name: Sepal.Length, dtype: float64
# calculate group by mean , median, standard deviation, min, max
df.groupby('Species')['Sepal.Length'].agg(['mean', 'median', 'std', 'min', 'max'])
| mean | median | std | min | max | |
|---|---|---|---|---|---|
| Species | |||||
| Setosa | 5.100000 | 5.1 | NaN | 5.1 | 5.1 |
| seTosa | 5.000000 | 5.0 | NaN | 5.0 | 5.0 |
| setosa | 5.004167 | 5.0 | 0.359644 | 4.3 | 5.8 |
| versicolor | 5.936000 | 5.9 | 0.516171 | 4.9 | 7.0 |
| virginica | 6.588000 | 6.5 | 0.635880 | 4.9 | 7.9 |
# make Species all lower case
df['Species'] = df['Species'].str.lower()
# calculate group by mean , median, standard deviation, min, max
df.groupby('Species')['Sepal.Length'].agg(['mean', 'median', 'std', 'min', 'max'])
| mean | median | std | min | max | |
|---|---|---|---|---|---|
| Species | |||||
| setosa | 5.006122 | 5.0 | 0.356141 | 4.3 | 5.8 |
| versicolor | 5.936000 | 5.9 | 0.516171 | 4.9 | 7.0 |
| virginica | 6.588000 | 6.5 | 0.635880 | 4.9 | 7.9 |
# select columns /Indexing
df[['Species', 'Sepal.Length']].head(5)
| Species | Sepal.Length | |
|---|---|---|
| 0 | setosa | 5.1 |
| 1 | setosa | 4.9 |
| 2 | setosa | 4.7 |
| 3 | setosa | 4.6 |
| 4 | setosa | 5.0 |
# use loc to select rows and columns or use iloc to select rows and columns by index
df.loc[0:5, ['Species', 'Sepal.Length']] # select first 5 rows and columns 'Species' and 'Sepal.Length'
| Species | Sepal.Length | |
|---|---|---|
| 0 | setosa | 5.1 |
| 1 | setosa | 4.9 |
| 2 | setosa | 4.7 |
| 3 | setosa | 4.6 |
| 4 | setosa | 5.0 |
| 5 | setosa | 5.4 |
# use iloc to select rows and columns by index
df.iloc[0:5, 0:2] # select first 5 rows and first 2 columns
| Sepal.Length | Sepal.Width | |
|---|---|---|
| 0 | 5.1 | 3.5 |
| 1 | 4.9 | 3.0 |
| 2 | 4.7 | 3.2 |
| 3 | 4.6 | 3.1 |
| 4 | 5.0 | 3.6 |
# multiple conditions
df[(df['Species'] == 'setosa') & (df['Sepal.Length'] > 5.4)]
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
|---|---|---|---|---|---|
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
| 18 | 5.7 | 3.8 | 1.7 | 0.3 | setosa |
| 33 | 5.5 | 4.2 | 1.4 | 0.2 | setosa |
| 36 | 5.5 | 3.5 | 1.3 | 0.2 | setosa |
# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'Age': [25, 30, 35, 40, 45],
'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
'Salary': [50000, 60000, 75000, 80000, 55000]}
data= pd.DataFrame(data)
data
| Name | Age | Gender | Salary | |
|---|---|---|---|---|
| 0 | Alice | 25 | Female | 50000 |
| 1 | Bob | 30 | Male | 60000 |
| 2 | Charlie | 35 | Male | 75000 |
| 3 | David | 40 | Male | 80000 |
| 4 | Eva | 45 | Female | 55000 |
pandas long query chain¶
The chain below adds a "Salary_Increase" column with a 10% salary raise, sorts the DataFrame by age in descending order, filters to include only males, drops the "Gender" column, resets the index, renames columns, selects specific columns, replaces gender values, converts the "Full Name" values to uppercase, and calculates yearly salaries. The final output is stored in the 'result' DataFrame.
# Chain of Pandas operations
result = (
data
.assign(Salary_Increase=lambda x: x['Salary'] * 1.1) # Add a new column for salary increase
.sort_values(by='Age', ascending=False) # Sort by age in descending order
.query('Gender == "Male"') # Filter only males
.drop(columns=['Gender']) # Drop the gender column
.reset_index(drop=True) # Reset the index
.rename(columns={'Name': 'Full Name', 'Age': 'Employee Age'}) # Rename columns
.loc[:, ['Full Name', 'Employee Age', 'Salary', 'Salary_Increase']] # Select specific columns
.replace({'Male': 'M', 'Female': 'F'}) # Replace gender values
.apply(lambda x: x.str.upper() if x.name == 'Full Name' else x) # Uppercase Full Name
.assign(Yearly_Salary=lambda x: x['Salary_Increase'] * 12) # Calculate yearly salary
)
result
Name Age Gender Salary 0 Alice 25 Female 50000 1 Bob 30 Male 60000 2 Charlie 35 Male 75000 3 David 40 Male 80000 4 Eva 45 Female 55000
| Full Name | Employee Age | Salary | Salary_Increase | Yearly_Salary | |
|---|---|---|---|---|---|
| 0 | DAVID | 40 | 80000 | 88000.0 | 1056000.0 |
| 1 | CHARLIE | 35 | 75000 | 82500.0 | 990000.0 |
| 2 | BOB | 30 | 60000 | 66000.0 | 792000.0 |
df.columns
Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
'Species'],
dtype='object')
# Chain of Pandas operations on the iris data; each step returns a new DataFrame,
# so df itself is left unchanged
result = (
    df
    .assign(Sepal_Area=lambda x: x['Sepal.Length'] * x['Sepal.Width'])  # Add a new column for sepal area
    .assign(Petal_Area=lambda x: x['Petal.Length'] * x['Petal.Width'])  # Add a new column for petal area
    .sort_values(by='Sepal.Length', ascending=False)  # Sort by sepal length in descending order
    .query('Species == "versicolor"')  # Filter rows where species is versicolor
    .drop(columns=['Species'])  # Drop the species column
    .reset_index(drop=True)  # Reset the index
    .rename(columns={'Sepal.Length': 'Sepal Length', 'Sepal.Width': 'Sepal Width',
                     'Petal.Length': 'Petal Length', 'Petal.Width': 'Petal Width',
                     'Sepal_Area': 'Sepal Area', 'Petal_Area': 'Petal Area'})  # Rename columns
    .loc[:, ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Sepal Area', 'Petal Area']]  # Select specific columns
    .apply(lambda x: x.round(2) if x.name != 'Sepal Width' else x)  # Round values to 2 decimal places (except Sepal Width)
    # (the 'Full Name' upper-casing step from the salary example was removed:
    # this frame has no such column, so it never changed anything)
    # filter rows where Sepal Length is greater than 6.5 and Sepal Width is greater than 3.04
    .query('`Sepal Length` > 6.5 & `Sepal Width` > 3.04')
)
result.head()
| Sepal Length | Sepal Width | Petal Length | Petal Width | Sepal Area | Petal Area | |
|---|---|---|---|---|---|---|
| 0 | 7.0 | 3.2 | 4.7 | 1.4 | 22.40 | 6.58 |
| 1 | 6.9 | 3.1 | 4.9 | 1.5 | 21.39 | 7.35 |
| 3 | 6.7 | 3.1 | 4.7 | 1.5 | 20.77 | 7.05 |
| 5 | 6.7 | 3.1 | 4.4 | 1.4 | 20.77 | 6.16 |
# rename columns
df.rename(columns={'Sepal.Length': 'Sepal_Length'}, inplace=True)
# split the data by species: df1 holds the setosa rows, df2 the versicolor rows
df1 = df[df['Species'] == 'setosa']
df2 = df[df['Species'] == 'versicolor']
# stack df1 and df2 vertically (axis=0 places the rows of df2 below those of df1)
df3 = pd.concat([df1, df2], axis=0)

# merge df1 and df2 by columns, joining on the 'Species' key
# NOTE(review): df1 only contains 'setosa' rows and df2 only 'versicolor' rows, so an
# inner join on 'Species' finds no matching keys and df4 has no rows — confirm the
# intended join key or method.
df4=pd.merge(df1, df2, on='Species', how='inner')
# generate multiple .txt files, each with an 'id,sst' header and random numbers
import random

# Draw 10 distinct file ids from 1..100 so that no file is overwritten by a later
# one with the same name (the name file_id also avoids shadowing the built-in id()).
for file_id in random.sample(range(1, 101), 10):
    # Create and open the .txt file for writing
    with open(f'{file_id}.txt', 'w') as f:
        # Write the header line with the column names
        f.write('id,sst\n')
        # Write 10 lines of random data as an example (you can modify this part as needed)
        for _ in range(10):
            data1 = random.uniform(0, 10)
            data2 = random.uniform(0, 10)
            f.write(f'{data1},{data2}\n')
# Read every .txt file of a directory into a DataFrame and stack them into one.
import os

# Specify the directory where your files are located
directory = 'E:/my works/prensentation/data'
# Get a list of file names in the directory
file_names = os.listdir(directory)
# Keep only the text files; sort them so the row order of the combined
# DataFrame does not depend on the order os.listdir() happens to return
txt_files = sorted(name for name in file_names if name.endswith('.txt'))
# Read each file into its own DataFrame (adjust the delimiter if needed).
# A comprehension is used so that the module-level name `df`, which other
# cells rely on, is not rebound to the last file that was read.
data_frames = [
    pd.read_csv(os.path.join(directory, name), delimiter=',')
    for name in txt_files
]
# Concatenate the DataFrames into one. pd.concat raises ValueError on an
# empty list, so fall back to an empty DataFrame when no .txt file was found.
combined_df = pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame()
combined_df.head()
| | 12 | 91 | 15 | 28 | 16 | 19 | 17 | 93 | 24 | 13 | id | sst | 59 | 86 | 69 | 35 | 8 | 83 | 56 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.510590 | 7.809940 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7.798259 | 8.325629 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.488465 | 1.036125 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.336252 | 5.043532 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7.459540 | 1.735971 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3. Data Visualization and Real-World Projects (Day 3)¶
| Matplotlib Function | Description |
|---|---|
| `plt.plot(x, y)` | Plot a line graph with data points x and y. |
| `plt.scatter(x, y)` | Create a scatter plot with data points x and y. |
| `plt.bar(x, height)` | Generate a bar chart with values x and height. |
| `plt.hist(data, bins)` | Create a histogram with the given data and bins. |
| `plt.xlabel('xlabel')` | Set the label for the x-axis. |
| `plt.ylabel('ylabel')` | Set the label for the y-axis. |
| `plt.title('title')` | Add a title to the plot. |
| `plt.legend()` | Display a legend for labeled elements on the plot. |
| `plt.grid(True)` | Enable gridlines on the plot. |
| `plt.savefig('filename.png')` | Save the current plot as an image file. |
import matplotlib.pyplot as plt # add matplotlib.pyplot library as plt
# Line plot of sin(x), drawn through the object-oriented Axes interface
x = np.linspace(0, 10, 100)  # 100 evenly spaced points between 0 and 10
y = np.sin(x)
fig, ax = plt.subplots(figsize=(8, 4))  # one figure holding one axes
ax.plot(x, y, label='sin(x)')  # the label is what the legend displays
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Line Plot')
ax.legend()
ax.grid(True)
plt.show()
# Scatter plot of 50 uniformly distributed random points
x = np.random.rand(50)
y = np.random.rand(50)
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x, y, color='b', marker='o', label='Random Data')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_title('Scatter Plot')
ax.legend()
ax.grid(True)
plt.show()
# Histogram of 1000 samples drawn from the standard normal distribution
data = np.random.randn(1000)
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(data, bins=20, color='g', alpha=0.6, edgecolor='black')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram')
ax.grid(True)
plt.show()
# Bar chart: one bar per category
categories = ['A', 'B', 'C', 'D', 'E']
values = [10, 15, 7, 12, 9]
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(categories, values, color='r', alpha=0.7)
ax.set_xlabel('Categories')
ax.set_ylabel('Values')
ax.set_title('Bar Chart')
ax.grid(axis='y')  # horizontal gridlines only
plt.show()
# Pie chart: each wedge is labelled with its name and its percentage share
labels = ['A', 'B', 'C', 'D']
sizes = [15, 30, 45, 10]
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',  # percentage with one decimal place
    startangle=140,
    shadow=True,
)
ax.set_title('Pie Chart')
plt.show()
# Box plot of four normal samples (mean 0) with standard deviations 1 to 4
data = [np.random.normal(0, scale, 100) for scale in range(1, 5)]
fig, ax = plt.subplots(figsize=(8, 4))
ax.boxplot(data, vert=False, labels=['A', 'B', 'C', 'D'])  # horizontal boxes
ax.set_xlabel('Value')
ax.set_title('Box Plot')
ax.grid(True)
plt.show()
# Heatmap of a random 5x5 matrix, with a slightly shrunken colorbar
data = np.random.rand(5, 5)
fig, ax = plt.subplots(figsize=(6, 6))
heatmap = ax.imshow(data, cmap='coolwarm', interpolation='nearest')
# Adjust the shrink value as needed to change the colorbar length
colorbar = fig.colorbar(heatmap, ax=ax, shrink=0.8)
ax.set_title('Heatmap')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Plot the parabola y = x^2 on the interval [-5, 5]
x = np.linspace(-5, 5, 100)  # 100 evenly spaced points between -5 and 5
y = x ** 2
# Draw the curve on a dedicated axes
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(x, y, label='y = x^2', color='blue')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Plot of y = x^2')
ax.legend()
ax.grid(True)
plt.show()
# Create a 2x2 grid of subplots, one per numeric column of df
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
df['Sepal.Length'].plot(ax=axes[0, 0], color='r', title='Sepal Length')
df['Sepal.Width'].plot(ax=axes[0, 1], linestyle='--', marker='o', color='g', label='Data', title='Sepal Width')
df['Petal.Length'].plot(ax=axes[1, 0], color='b', title='Petal Length', legend=True)
df['Petal.Width'].plot(ax=axes[1, 1], color='y', title='Petal Width', legend=True, grid=True, label='Petal Width')
# To save the plot, uncomment:
#plt.savefig('subplots.png', dpi=300, bbox_inches='tight')
# Add an equation to axes[1, 1] at data coordinates (30, 1.5).
# The raw string keeps the backslash of the LaTeX command \sqrt literal;
# in a normal string '\s' is an invalid escape sequence and triggers a warning.
axes[1, 1].text(30, 1.5, r'$y = \sqrt{x^2+1}$', fontsize=20, horizontalalignment='center', verticalalignment='center')
Text(30, 1.5, '$y = \\sqrt{x^2+1}$')
import matplotlib.gridspec as gridspec
# Build a 2x2 figure whose columns and rows have unequal sizes:
# the left column is twice as wide, the bottom row twice as tall.
fig = plt.figure(figsize=(8, 6))
gs = gridspec.GridSpec(2, 2, width_ratios=[2, 1], height_ratios=[1, 2])

# Top-left: 10 noisy curves around a geometric progression from 1 to 10
ax0 = fig.add_subplot(gs[0, 0])
N = 10
data = (np.geomspace(1, 10, 100) + np.random.randn(N, 100)).T
ax0.plot(data)
ax0.set_title('Plot 1')

# Top-right: histogram of 1000 standard-normal samples
ax1 = fig.add_subplot(gs[0, 1])
ax1.hist(np.random.randn(1000), bins=20, color='g', alpha=0.6, edgecolor='black')
ax1.set_title('Plot 2')

# Bottom-left: stem plot of exp(sin(x))
ax2 = fig.add_subplot(gs[1, 0])
x = np.linspace(0.1, 2 * np.pi, 41)
y = np.exp(np.sin(x))
ax2.stem(x, y)
ax2.set_title('Plot 3')

# Bottom-right: bar chart
ax3 = fig.add_subplot(gs[1, 1])
ax3.bar(['A', 'B', 'C', 'D'], [10, 20, 30, 40], color='y', alpha=0.7)
ax3.set_title('Plot 4')

# Adjust the spacing between the panels, then display the figure
fig.tight_layout()
plt.show()
Statistical test¶
from scipy import stats
# Draw 1000 samples from a normal distribution with mean 0 and standard deviation 1
data = np.random.randn(1000)
# group1 and group2 are the first and second half of the same sample
half = data.size // 2
group1, group2 = data[:half], data[half:]
# group3 is a separate draw of 500 samples
group3 = np.random.randn(500)
# Independent two-sample t-test comparing group1 with group2
t_stat, p_value = stats.ttest_ind(a=group1, b=group2)
print(f'T-statistic: {t_stat}, P-value: {p_value}')
T-statistic: 0.31639492837773037, P-value: 0.7517689292338926
# One-way ANOVA across the three groups
samples = (group1, group2, group3)
f_stat, p_value = stats.f_oneway(*samples)
print(f'F-statistic: {f_stat}, P-value: {p_value}')
F-statistic: 0.05427300180328578, P-value: 0.9471753547403352
# Chi-square test for independence on a 2x3 table of observed counts
observed_data = np.array([
    [10, 10, 20],
    [20, 20, 20],
])
chi2_result = stats.chi2_contingency(observed_data)
# The result unpacks into: statistic, p-value, degrees of freedom, expected counts
chi2_stat, p_value, dof, expected = chi2_result
print(f'Chi2 statistic: {chi2_stat}, P-value: {p_value}, Degrees of freedom: {dof}')
Chi2 statistic: 2.7777777777777777, P-value: 0.24935220877729622, Degrees of freedom: 2
import statsmodels.api as sm
# Remove rows with missing values (modifies df in place)
df.dropna(inplace=True)
predictors = ['Sepal.Width', 'Petal.Length', 'Petal.Width']
y = df['Sepal.Length']  # dependent variable
X = df[predictors]  # independent variables
# Multiple linear regression by ordinary least squares;
# add_constant() adds the 'const' column so the model has an intercept
design = sm.add_constant(X)
model = sm.OLS(y, design)
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Sepal.Length R-squared: 0.806
Model: OLS Adj. R-squared: 0.802
Method: Least Squares F-statistic: 201.4
Date: Wed, 20 Sep 2023 Prob (F-statistic): 1.69e-51
Time: 23:00:04 Log-Likelihood: -60.431
No. Observations: 149 AIC: 128.9
Df Residuals: 145 BIC: 140.9
Df Model: 3
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------
const 2.4966 0.276 9.055 0.000 1.952 3.042
Sepal.Width 0.5380 0.076 7.071 0.000 0.388 0.688
Petal.Length 0.4576 0.053 8.703 0.000 0.354 0.562
Petal.Width -0.0160 0.122 -0.132 0.895 -0.256 0.224
==============================================================================
Omnibus: 6.859 Durbin-Watson: 2.011
Prob(Omnibus): 0.032 Jarque-Bera (JB): 8.525
Skew: -0.286 Prob(JB): 0.0141
Kurtosis: 4.023 Cond. No. 50.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.linear_model import LinearRegression
# Fit the same regression with scikit-learn and report the fitted parameters
feature_columns = ['Sepal.Width', 'Petal.Length', 'Petal.Width']
X = df[feature_columns]  # independent variables
y = df['Sepal.Length']  # dependent variable
model = LinearRegression().fit(X, y)  # fit() returns the fitted estimator
print(f'Intercept: {model.intercept_}')
print(f'Coefficients: {model.coef_}')
Intercept: 2.4965574039645433 Coefficients: [ 0.53803901 0.4575952 -0.01601361]
Advanced plotting with seaborn¶
| Seaborn Function | Description |
|---|---|
| `sns.scatterplot(x, y, data)` | Create a scatter plot with two numeric variables |
| `sns.lineplot(x, y, data)` | Create a line plot with two numeric variables |
| `sns.barplot(x, y, data)` | Create a bar plot with a categorical and numeric variable |
| `sns.countplot(x, data)` | Create a count plot for categorical data |
| `sns.boxplot(x, y, data)` | Create a box plot to visualize data distribution |
| `sns.violinplot(x, y, data)` | Create a violin plot to show data distribution |
| `sns.heatmap(data)` | Create a heatmap to visualize data correlations |
| `sns.pairplot(data)` | Create a pair plot for multiple variables |
| `sns.distplot(data)` | Create a distribution plot for a numeric variable (deprecated in recent seaborn versions; use `sns.histplot` or `sns.displot`) |
| `sns.lmplot(x, y, data)` | Create a linear regression plot |
| `sns.catplot(x, y, data)` | Create a categorical plot |
Advanced statistical plot¶
import seaborn as sns
# Regression plot of sepal length against sepal width, one fit per species,
# annotated with the per-species correlation coefficient.
colors = {'setosa': 'blue', 'versicolor': 'green', 'virginica': 'red'}  # custom color per species
g = sns.lmplot(
    data=df,
    x='Sepal.Width',
    y='Sepal.Length',
    hue='Species',
    palette=colors,  # use the custom colors defined above
    fit_reg=True,
    legend=True,
    facet_kws={'legend_out': True},
)
# Write each species' correlation next to its cloud of points,
# offset by 0.5 from the group means and drawn in the species color
for species_name, species_group in df.groupby('Species'):
    widths = species_group['Sepal.Width']
    lengths = species_group['Sepal.Length']
    corr = widths.corr(lengths)
    plt.text(
        widths.mean()+0.5,
        lengths.mean()+0.5,
        f'Corr: {corr:.2f}',
        fontsize=12,
        color=colors[species_name],
    )
g.set_axis_labels('Sepal Width', 'Sepal Length')
# Show the plot
plt.show()
# Optional: save the plot
#plt.savefig('seaborn.png', dpi=300, bbox_inches='tight')