print('This is my first python code')

This is my first python code

name= ' your name'
print(name)

 your name

# Variables and Data Types:
name = "John" # name is a variable of type string
age = 30 # age is a variable of type integer
height = 6.2 # height is a variable of type float
is_student = False # is_student is a variable of type boolean

print(name)
print(age)
print(height)
print(is_student)

John
30
6.2
False

print(type(name))

<class 'str'>

# show the type of each variable
print(type(name))
print(type(age))
print(type(height))
print(type(is_student))

<class 'str'>
<class 'int'>
<class 'float'>
<class 'bool'>

# print all variables in one line
print(name, age, height, is_student)
# or
print(name + " " + str(age) + " " + str(height) + " " + str(is_student))
# or f-string
print(f"{name} {age} {height} {is_student}")

John 30 6.2 False
John 30 6.2 False
John 30 6.2 False

print('your name','30')

your name 30

30*10

300

# Arithmetic Operators
x = 10
y = 5

addition_result = x + y
print("Addition:", addition_result)  # Output: 15

subtraction_result = x - y
print("Subtraction:", subtraction_result)  # Output: 5

multiplication_result = x * y
print("Multiplication:", multiplication_result)  # Output: 50

division_result = x / y
print("Division:", division_result)  # Output: 2.0

exponentiation_result = x ** y
print("Exponentiation:", exponentiation_result)  # Output: 100000

Addition: 15
Subtraction: 5
Multiplication: 50
Division: 2.0
Exponentiation: 100000

# Arithmetic Operators
x = 10
y = 5

addition_result = x + y
print("Addition:", addition_result)  # Output: 15

subtraction_result = x - y
print("Subtraction:", subtraction_result)  # Output: 5

multiplication_result = x * y
print("Multiplication:", multiplication_result)  # Output: 50

division_result = x / y
print("Division:", division_result)  # Output: 2.0

exponentiation_result = x ** y
print("Exponentiation:", exponentiation_result)  # Output: 100000

floor_division_result = x // y
print("Floor Division:", floor_division_result)  # Output: 2

# Bitwise Operators
a = 5  # Binary: 101
b = 3  # Binary: 011

bitwise_and_result = a & b
print("Bitwise AND:", bitwise_and_result)  # Output: 1 (Binary: 001)

bitwise_or_result = a | b
print("Bitwise OR:", bitwise_or_result)  # Output: 7 (Binary: 111)

bitwise_xor_result = a ^ b
print("Bitwise XOR:", bitwise_xor_result)  # Output: 6 (Binary: 110)

bitwise_left_shift_result = a << 1
print("Bitwise Left Shift:", bitwise_left_shift_result)  # Output: 10 (Binary: 1010)

bitwise_right_shift_result = a >> 1
print("Bitwise Right Shift:", bitwise_right_shift_result)  # Output: 2 (Binary: 10)

# Comparison Operators
x = 10
y = 5

less_than_or_equal = x <= y
print("Less than or equal to:", less_than_or_equal)  # Output: False

greater_than = x > y
print("Greater than:", greater_than)  # Output: True

greater_than_or_equal = x >= y
print("Greater than or equal to:", greater_than_or_equal)  # Output: True

not_equal = x != y
print("Not equal to:", not_equal)  # Output: True

equal_to = x == y
print("Equal to:", equal_to)  # Output: False

# Assignment Expression
if (n := len("hello")) > 4:
    print(f"The length of 'hello' is {n}.")  # Output: "The length of 'hello' is 5."

# Matrix Multiplication Operator
import numpy as np

matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])

matrix_product = matrix_a @ matrix_b
print("Matrix Product:")
print(matrix_product)

Addition: 15
Subtraction: 5
Multiplication: 50
Division: 2.0
Exponentiation: 100000
Floor Division: 2
Bitwise AND: 1
Bitwise OR: 7
Bitwise XOR: 6
Bitwise Left Shift: 10
Bitwise Right Shift: 2
Less than or equal to: False
Greater than: True
Greater than or equal to: True
Not equal to: True
Equal to: False
The length of 'hello' is 5.
Matrix Product:
[[19 22]
 [43 50]]

# list
fruits = ["apple", "banana", "cherry"]
print(fruits)

['apple', 'banana', 'cherry']

mix_list=['apple', 3, 'banana', 4, 'cherry', 5]
print(mix_list)

['apple', 3, 'banana', 4, 'cherry', 5]

# check data type
print(type(fruits))

<class 'list'>

#Dictionaries:
person = {"name": "Alice", "age": 25, "city": "New York"}
print(type(person))
print(person["name"])  # Accessing values
person["occupation"] = "Engineer"  # Adding key-value pairs

<class 'dict'>
Alice

person.keys()  # Accessing keys

dict_keys(['name', 'age', 'city', 'occupation'])

person.values()  # Accessing values

dict_values(['Alice', 25, 'New York', 'Engineer'])

# Add function from built-in library or third-party library
from math import sqrt
print(sqrt(25))

5.0

# list of numbers
numbers = [1, 4, 9]
# square root of each number
roots = [sqrt(n) for n in numbers]
print(roots)

[1.0, 2.0, 3.0]

# Define your own function 
def my_square(n):
    """Return n raised to the power of two."""
    return pow(n, 2)
print(my_square(5))

25

# or
def greet(name):
    """Build the greeting "Hello, <name>!" for the given name string."""
    pieces = ("Hello, ", name, "!")
    return "".join(pieces)

result = greet("Bob")
print(result)

Hello, Bob!

# Creating a sample list
# indexing is zero-based: the value 1 is at index 0, 7 is at index 1, 3 is at index 2
my_list = [1, 7, 3, 4, 5]
# Accessing an element by index
my_list[1]

7

# Slicing
my_list[1:4]

[7, 3, 4]

# Accessing the last element
my_list[-1]

5

# Adding an item at the end
my_list.append(11)
my_list

[1, 7, 3, 4, 5, 11]

# Inserting an item at an index
my_list.insert(2,88)  # 2 is the index, 88 is the value
my_list

[1, 7, 88, 3, 4, 5, 11]

# Removing an item by value
my_list.remove(4)
my_list

[1, 7, 88, 3, 5, 11]

# Creating a sample dictionary
my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}
my_dict # keys and values

{'name': 'John', 'age': 30, 'city': 'New York'}

# print all keys
my_dict.keys()

dict_keys(['name', 'age', 'city'])

# print all values
my_dict.values()

dict_values(['John', 30, 'New York'])

# Accessing a value by key
my_dict['age']

30

# Adding a key-value pair
my_dict['job'] = 'Engineer'

my_dict

{'name': 'John', 'age': 30, 'city': 'New York', 'job': 'Engineer'}

# Removing a key-value pair
del my_dict['city']
my_dict

{'name': 'John', 'age': 30, 'job': 'Engineer'}

# merge two dictionaries
my_dict1 = {'name': 'John', 'age': 30}
my_dict2 = {'city': 'New York', 'job': 'Engineer'}
my_dict1.update(my_dict2)
my_dict1

{'name': 'John', 'age': 30, 'city': 'New York', 'job': 'Engineer'}

# merge two lists
my_list1 = [1, 2, 3]
my_list2 = [4, 5, 6]
my_list1.extend(my_list2)
my_list1

[1, 2, 3, 4, 5, 6]

# add two lists with different lengths and different data types
my_list1 = [1, 2, 3]
my_list2 = [4, 5, 6, 7]
# string
my_list3 = ['a', 'b', 'c']

my_list1 + my_list2 # numbers
my_list1 + my_list3 # numbers and string

[1, 2, 3, 'a', 'b', 'c']

my_list1 + my_list2 # numbers

[1, 2, 3, 4, 5, 6, 7]

my_list1 + my_list3

[1, 2, 3, 'a', 'b', 'c']

#Conditional Statements (if-else):
x = 5
if x > 5:
    print("x is greater than 5")
elif x == 5:
    print("x is equal to 5")
else:
    print("x is not greater than 5")

x is equal to 5

# Example using a for loop
fruits = ["apple", "banana", "cherry"]
for item in fruits:
    print(item)

apple
banana
cherry

# alternative way in one line ; list comprehension : [expression for item in list]
# NOTE: print() returns None, so this also builds a throwaway list [None, None, None];
# when only the side effect is wanted, the plain for loop above is the clearer choice
[print(fruit) for fruit in fruits]

apple
banana
cherry

[None, None, None]

# Example using a for loop with range: range is a built-in function that returns a sequence of numbers
for i in range(1, 6):
    print(i)
# alternative way in one line
[i for i in range(1, 6)]

[1, 2, 3, 4, 5]

# Example using a while loop
count = 0 # initialize count to 0
while count < 6: # condition:  continue to run until condition is false
    print(count) # print count
    count += 1 # increment count by 1
    # this is equivalent to count = count + 1; the loop prints 0 through 5 and ends once count reaches 6

# Example using a while loop with break and continue
num = 0
while num < 10:
    num += 1
    if num == 5:
        continue  # Skip iteration when num is 5 which means it will not print 5
    print(num)
    if num == 8:
        break  # Exit the loop when num is 8 which means it will not print 9 and 10

import datetime
datetime.datetime.now()

datetime.datetime(2024, 1, 8, 8, 29, 29, 628952)

# datetime 
date='2020-01-01'
date = datetime.datetime.strptime(date, '%Y-%m-%d')
print(date)
print(type(date))

2020-01-01 00:00:00
<class 'datetime.datetime'>

# get month, day, year
print(date.month)
print(date.day)
print(date.year)

1
1
2020

def add_numbers(x, y):
    """
    Return the sum of two numbers.

    Parameters:
    - x (int or float): The first addend.
    - y (int or float): The second addend.

    Returns:
    - int or float: x + y (an int when both inputs are ints).
    """
    return x + y
# call function
add_numbers(5, 10)

15

import math
def calculate_factorial(n):
    """
    Return n! by delegating to math.factorial.
    """
    factorial_value = math.factorial(n)
    return factorial_value
print(calculate_factorial(5))

120

def factorial(n):
    """
    Calculates the factorial of a non-negative whole number using recursion.

    Parameters:
    - n (int): The number whose factorial is computed; must be >= 0.

    Returns:
    - int: n! (0! and 1! are both 1).

    Raises:
    - ValueError: If n is negative or not a whole number. Such inputs never
      reach the base case and would otherwise recurse until RecursionError.
    """
    if n < 0 or n != int(n):
        raise ValueError(f"factorial() requires a non-negative whole number, got {n!r}")
    # Base case: after the guard above, n <= 1 means n is 0 or 1
    if n <= 1:
        return 1
    return n * factorial(n - 1)

# Call the function
factorial_result = factorial(5)
print(f"The factorial is: {factorial_result}")

The factorial is: 120

def add_numbers(a, b):
    """Return the sum of a and b."""
    total = a + b
    return total

result = add_numbers(3, 5)
print(result)

add = lambda x, y: x + y
result = add(3, 5)

print(result)

8

def calculator(operation):
    """
    Return the two-argument function that implements the named operation.

    Parameters:
    - operation (str): "add" or "subtract".

    Returns:
    - function or None: the matching inner function, or None when the
      operation name is not recognised.
    """
    def add(x, y):
        return x + y

    def subtract(x, y):
        return x - y

    # Guard clauses: hand back the first match, fall through to None
    if operation == "add":
        return add
    if operation == "subtract":
        return subtract
    return None

add_function = calculator("add")
result_add = add_function(3, 4)

subtract_function = calculator("subtract")
result_subtract = subtract_function(7, 2)

print(result_add)      # Output: 7
print(result_subtract) # Output: 5

7
5

def read_and_print_file(file_path):
    """
    Print every line of a text file with surrounding whitespace removed.

    Parameters:
    - file_path (str): Path of the file to read.

    A missing file is reported with a "File not found" message instead of
    letting FileNotFoundError propagate.
    """
    try:
        with open(file_path, 'r') as handle:
            # Lazily strip each line as it is read from the file
            stripped_lines = (raw_line.strip() for raw_line in handle)
            for text_line in stripped_lines:
                print(text_line)
    except FileNotFoundError:
        print(f"File not found: {file_path}")

# Test the function
read_and_print_file('example.txt')
# Assumes 'example.txt' contains some lines of text

File not found: example.txt

import numpy as np

arr = np.array([1, 2, 3, 4, 5])
mean_value = np.mean(arr)

print(mean_value)

3.0

#mymodule.py:
def greet(name):
    """Print "Hello, <name>!" to standard output."""
    message = "Hello, {}!".format(name)
    print(message)

def square(x):
    """Return x raised to the power of two."""
    return pow(x, 2)
# import mymodule

# mymodule.greet("Alice")  # Output: Hello, Alice!
# result = mymodule.square(4)
# print(result)  # Output: 16

numbers = [1, 2, 3, 4, 5]
# use of map without lambda
def square(x):
    """Return x raised to the power of two (used as the function passed to map)."""
    return pow(x, 2)
squared = map(square, numbers) # map(function, iterable)
print(list(squared))

[1, 4, 9, 16, 25]

# Using map to square a list of numbers
numbers = [1, 2, 3, 4, 5]
squared = map(lambda x: x**2, numbers) # map(function, iterable)
print(list(squared))

names = ["Alice", "Bob", "Charlie"]
scores = [95, 87, 92]

zipped_data = zip(names, scores) # zip(iterable1, iterable2)
zipped_list = list(zipped_data)
print(zipped_list)  #

[('Alice', 95), ('Bob', 87), ('Charlie', 92)]

numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9]

# Filter even numbers
even_numbers = filter(lambda x: x % 2 == 0, numbers) # filter(function, iterable)
even_list = list(even_numbers)
print(even_list)

[2, 4, 6, 8]

from functools import reduce

numbers = [1, 2, 3, 4, 5]
product = reduce(lambda x, y: x * y, numbers) # reduce(function, iterable)
print(product)

120

# simple example of regular expression
import re
pattern = r"Cookie"
sequence = "Cookie"
if re.match(pattern, sequence):
  print("Match!")

Match!

# example of regular expression: split a sentence on whitespace characters
txt = "The rain in starkville is good"
# Raw string so that \s reaches the regex engine untouched; in a normal string
# literal "\s" is an invalid escape sequence and triggers a warning
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'starkville', 'is', 'good']

import re
# Sample text containing phone numbers
text = "Please contact us at 123-456-7890 or 555-555-5555 for assistance."

# Define a regular expression pattern for matching phone numbers
pattern = r'\d{3}-\d{3}-\d{4}'

# Use the findall method to find all matches in the text
phone_numbers = re.findall(pattern, text)
print(phone_numbers)

['123-456-7890', '555-555-5555']

# Sample text containing email addresses
text = "Please contact support@example.com for assistance or john.doe@email.co for more information."

# Define a regular expression pattern for matching email addresses.
# The top-level-domain class is [A-Za-z]: inside a character class "|" is a
# literal pipe rather than alternation, so [A-Z|a-z] would also accept "|".
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

# Use the findall method to find all matches in the text
email_addresses = re.findall(pattern, text)

# Print the extracted email addresses
for email in email_addresses:
    print(email)

support@example.com
john.doe@email.co

# import built-in libraries
import math
import random
# importing pandas library,numpy library: Third-party libraries
import pandas as pd
import numpy as np

# add numpy library as np
import numpy as np

# Create a 1D array from a Python list
arr_1d = np.array([1, 2, 3, 4, 5])
print(arr_1d)

[1 2 3 4 5]

# shape of array
arr_1d.shape

(5,)

# Create a 2D array (matrix)
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(matrix)
# shape of matrix with fstring 
print(f"Shape: {matrix.shape}, Rows: {matrix.shape[0]}, Columns: {matrix.shape[1]}")

[[1 2 3]
 [4 5 6]
 [7 8 9]]
Shape: (3, 3), Rows: 3, Columns: 3

# Create an array of zeros
zeros = np.zeros((2, 3))  # Creates a 2x3 array of zeros

# Create an array of ones
ones = np.ones((3, 2))  # Creates a 3x2 array of ones

# Create an identity matrix
identity = np.eye(3)  # Creates a 3x3 identity matrix

print(f'Zeros:\n{zeros}\n')
print(f'Ones:\n{ones}\n')
print(f'Identity Matrix:\n{identity}\n')

Zeros:
[[0. 0. 0.]
 [0. 0. 0.]]

Ones:
[[1. 1.]
 [1. 1.]
 [1. 1.]]

Identity Matrix:
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

# Element-wise addition
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
result = a + b
print(result)

[5 7 9]

# Matrix multiplication
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])
result_matrix = np.dot(matrix_a, matrix_b)
print(result_matrix)

[[19 22]
 [43 50]]

# Indexing
arr = np.array([10, 20, 30, 40, 50])
arr[2]  # Accessing the element at index

30

# Slicing
arr[1:4]  # Getting the elements at indices 1 through 3 (the stop index 4 is excluded)

array([20, 30, 40])

# mean, median, standard deviation
arr = np.array([1, 2, 3, 4, 5])
print(f'Mean: {np.mean(arr)}')
print(f'Median: {np.median(arr)}')
print(f'Standard Deviation: {np.std(arr)}')

Mean: 3.0
Median: 3.0
Standard Deviation: 1.4142135623730951

# reshape
arr = np.array([1, 2, 3, 4, 5, 6])
new_arr = arr.reshape(2, 3)
print(new_arr)

[[1 2 3]
 [4 5 6]]

# create pandas dataframe
# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        'Age': [25, 30, 35, 40, 45],
        'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
        'Salary': [50000, 60000, 75000, 80000, 55000]}

df = pd.DataFrame(data)
df

# set up working directory
import os # os is a built-in library which provides functions for interacting with the operating system
os.getcwd() # get current working directory
os.chdir('E:/my works/prensentation/data') # change working directory

# read csv data
import pandas as pd # import pandas library as pd
df = pd.read_csv('iris.csv') # read csv file

# check data type
df.dtypes

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

# view first 5 rows
df.head(5)

# Descriptive Statistics
df.describe()

# check missing values
df.isnull().sum()

Sepal.Length    0
Sepal.Width     0
Petal.Length    1
Petal.Width     0
Species         0
dtype: int64

# fill missing values in the numeric columns with each column's mean
# numeric_only=True skips the text column 'Species', which has no mean; without it
# pandas emits a FutureWarning (older versions) or raises an error (newer versions)
df.fillna(df.mean(numeric_only=True), inplace=True)

C:\Users\Hafez\AppData\Local\Temp\ipykernel_2188\2806775267.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.fillna(df.mean(), inplace=True)

# check missing values again
df.isnull().sum()

Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

# any duplicate rows?
df.duplicated().sum()

1

# drop duplicate rows
df.drop_duplicates(inplace=True)

# save cleaned data
df.to_csv('iris_cleaned.csv', index=False) # index=False means do not save index

# apply a function to a column
# The iris columns are named 'Sepal.Length', 'Sepal.Width', ... (see df.dtypes),
# so the lowercase key 'sepal_length' raises KeyError. The demonstration runs on
# a copy so that it does not change the values the later cells analyse.
df_apply = df.copy()
df_apply['Sepal.Length'] = df_apply['Sepal.Length'].apply(lambda x: x * 2)


# Define a custom function to calculate the total
def calculate_total(row):
    """Return the row's 'Sepal.Length' value multiplied by itself.

    NOTE(review): the name suggests a sum or a product of two different
    columns, but the same column is multiplied by itself — confirm intent.
    """
    return row['Sepal.Length'] * row['Sepal.Length']


# Apply the function to each row (axis=1 passes one row at a time)
df_apply['Total'] = df_apply.apply(calculate_total, axis=1)

# calculate correlation between only numeric columns
df.select_dtypes(include=['float64', 'int64']).corr()

# make pivot table
df.pivot_table(index='Species', values='Sepal.Length', aggfunc='mean')

# group by
df.groupby('Species')['Sepal.Length'].mean()

Species
Setosa        5.100000
seTosa        5.000000
setosa        5.004167
versicolor    5.936000
virginica     6.588000
Name: Sepal.Length, dtype: float64

# calculate group by mean , median, standard deviation, min, max
df.groupby('Species')['Sepal.Length'].agg(['mean', 'median', 'std', 'min', 'max'])

# make Species all lower case
df['Species'] = df['Species'].str.lower()
# calculate group by mean , median, standard deviation, min, max
df.groupby('Species')['Sepal.Length'].agg(['mean', 'median', 'std', 'min', 'max'])

# select columns /Indexing
df[['Species', 'Sepal.Length']].head(5)

# use loc to select rows and columns by label, or iloc to select them by integer position
df.loc[0:5, ['Species', 'Sepal.Length']] # label slice is inclusive at both ends: index labels 0 through 5 (up to 6 rows), columns 'Species' and 'Sepal.Length'

# use iloc to select rows and columns by index
df.iloc[0:5, 0:2] # select first 5 rows and first 2 columns

# multiple conditions
df[(df['Species'] == 'setosa') & (df['Sepal.Length'] > 5.4)]

# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        'Age': [25, 30, 35, 40, 45],
        'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
        'Salary': [50000, 60000, 75000, 80000, 55000]}

data= pd.DataFrame(data)
data

# Chain of Pandas operations
# Each method returns a new DataFrame, so the steps run top to bottom without changing `data`
result = (
    data
    .assign(Salary_Increase=lambda x: x['Salary'] * 1.1)  # Add a new column for salary increase
    .sort_values(by='Age', ascending=False)  # Sort by age in descending order
    .query('Gender == "Male"')  # Filter only males
    .drop(columns=['Gender'])  # Drop the gender column
    .reset_index(drop=True)  # Reset the index
    .rename(columns={'Name': 'Full Name', 'Age': 'Employee Age'})  # Rename columns
    .loc[:, ['Full Name', 'Employee Age', 'Salary', 'Salary_Increase']]  # Select specific columns
    .replace({'Male': 'M', 'Female': 'F'})  # Replace gender values (no effect here: the Gender column was dropped above)
    .apply(lambda x: x.str.upper() if x.name == 'Full Name' else x)  # Uppercase Full Name
    .assign(Yearly_Salary=lambda x: x['Salary_Increase'] * 12)  # Multiply the increased salary by 12 -- NOTE(review): only a yearly figure if Salary is monthly; confirm
)
result

      Name  Age  Gender  Salary
0    Alice   25  Female   50000
1      Bob   30    Male   60000
2  Charlie   35    Male   75000
3    David   40    Male   80000
4      Eva   45  Female   55000

df.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

# Chain of Pandas operations
# Each method returns a new DataFrame, so `df` itself is left unchanged
result = (
    df
    .assign(Sepal_Area=lambda x: x['Sepal.Length'] * x['Sepal.Width'])  # Add a new column for sepal area
    .assign(Petal_Area=lambda x: x['Petal.Length'] * x['Petal.Width'])  # Add a new column for petal area
    .sort_values(by='Sepal.Length', ascending=False)  # Sort by sepal length in descending order
    .query('Species == "versicolor"')  # Filter rows where species is versicolor
    .drop(columns=['Species'])  # Drop the species column
    .reset_index(drop=True)  # Reset the index
    .rename(columns={'Sepal.Length': 'Sepal Length', 'Sepal.Width': 'Sepal Width',
                     'Petal.Length': 'Petal Length', 'Petal.Width': 'Petal Width',
                     'Sepal_Area': 'Sepal Area', 'Petal_Area': 'Petal Area'})  # Rename columns
    .loc[:, ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Sepal Area', 'Petal Area']]  # Select specific columns
    .apply(lambda x: x.round(2) if x.name != 'Sepal Width' else x)  # Round values to 2 decimal places (except Sepal Width)
    # The "uppercase 'Full Name'" step from the salary example was removed:
    # this frame has no 'Full Name' column, so that step never changed anything
    # filter rows where Sepal Length is greater than 6.5 and Sepal Width is greater than 3.04
    # (backticks are required in query() for column names that contain spaces)
    .query('`Sepal Length` > 6.5 & `Sepal Width` > 3.04')
)

result.head()

# rename columns
# rename() returns a new DataFrame; keeping the result in its own variable leaves
# `df` with the 'Sepal.Length' name that the plotting and regression cells still use
df_renamed = df.rename(columns={'Sepal.Length': 'Sepal_Length'})

# setosa df1 and versicolor df2
df1 = df[df['Species'] == 'setosa']
df2 = df[df['Species'] == 'versicolor']
# stack df1 and df2 vertically (axis=0 places the rows of df2 below those of df1)
df3 = pd.concat([df1, df2], axis=0)

# merge df1 and df2 on the key column 'Species'
# NOTE(review): df1 holds only 'setosa' rows and df2 only 'versicolor' rows, so an
# inner join on 'Species' has no matching keys and df4 comes out empty -- confirm intent
df4=pd.merge(df1, df2, on='Species', how='inner')

# generate multiple .txt files, each with an 'id' column and an 'sst' column of random numbers
import random

# Draw 10 distinct ids up front: independent randint() calls can repeat, and a
# repeated id would silently overwrite a file written earlier in the loop
file_ids = random.sample(range(1, 101), 10)

# Loop to generate and write data to multiple .txt files
for file_id in file_ids:
    # Create and open the .txt file for writing (the file is named after its id)
    with open(f'{file_id}.txt', 'w') as f:
        # Write the header line naming the two columns
        f.write('id,sst\n')

        # Write some data (you can modify this part as needed)
        for _ in range(10):  # Writing 10 lines of random data as an example
            data1 = random.uniform(0, 10)
            data2 = random.uniform(0, 10)
            f.write(f'{data1},{data2}\n')

# Import any necessary libraries
import os

# Specify the directory where your files are located
directory = 'E:/my works/prensentation/data'

# Get a list of file names in the directory
file_names = os.listdir(directory)

# Create an empty list to store the contents of the files
data_frames = []

# Loop through each file and read it into a DataFrame.
# The per-file frame is not named `df`, so the iris DataFrame that the later
# plotting and regression cells rely on is not overwritten.
for file_name in file_names:
    if file_name.endswith('.txt'):  # Check if it's a text file
        file_path = os.path.join(directory, file_name)
        file_df = pd.read_csv(file_path, delimiter=',')  # Adjust the delimiter if needed
        data_frames.append(file_df)

# Concatenate the DataFrames into one (ignore_index=True renumbers the rows 0..n-1)
combined_df = pd.concat(data_frames, ignore_index=True)
combined_df.head()

import matplotlib.pyplot as plt # add matplotlib.pyplot library as plt
# Line Plot
x = np.linspace(0, 10, 100)
y = np.sin(x)
plt.figure(figsize=(8, 4)) # set figure size
plt.plot(x, y, label='sin(x)') # plot x and y
plt.xlabel('x') # set x label
plt.ylabel('y') # set y label
plt.title('Line Plot') # set title
plt.legend() # show legend
plt.grid(True) # show grid
plt.show() # show plot

# Scatter Plot
x = np.random.rand(50)
y = np.random.rand(50)
plt.figure(figsize=(6, 6))
plt.scatter(x, y, color='b', marker='o', label='Random Data')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Scatter Plot')
plt.legend()
plt.grid(True)
plt.show()

# Histogram
data = np.random.randn(1000)
plt.figure(figsize=(8, 4))
plt.hist(data, bins=20, color='g', alpha=0.6, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.grid(True)
plt.show()

# Bar Chart
categories = ['A', 'B', 'C', 'D', 'E']
values = [10, 15, 7, 12, 9]
plt.figure(figsize=(8, 4))
plt.bar(categories, values, color='r', alpha=0.7)
plt.xlabel('Categories')
plt.ylabel('Values')
plt.title('Bar Chart')
plt.grid(axis='y')
plt.show()

# Pie Chart
labels = ['A', 'B', 'C', 'D']
sizes = [15, 30, 45, 10]
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, shadow=True)
plt.title('Pie Chart')
plt.show()

# Box Plot
data = [np.random.normal(0, std, 100) for std in range(1, 5)]
plt.figure(figsize=(8, 4))
plt.boxplot(data, vert=False, labels=['A', 'B', 'C', 'D'])
plt.xlabel('Value')
plt.title('Box Plot')
plt.grid(True)
plt.show()

# Heatmap
# Heatmap with a smaller colorbar
data = np.random.rand(5, 5)
plt.figure(figsize=(6, 6))
heatmap = plt.imshow(data, cmap='coolwarm', interpolation='nearest')
colorbar = plt.colorbar(heatmap, shrink=0.8)  # Adjust the shrink value as needed
plt.title('Heatmap')
plt.show()

import matplotlib.pyplot as plt
import numpy as np

# Generate x values
x = np.linspace(-5, 5, 100)  # Creates 100 evenly spaced points between -5 and 5

# Compute y values using the equation y = x^2
y = x**2

# Create the plot
plt.figure(figsize=(8, 4))
plt.plot(x, y, label='y = x^2', color='blue')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Plot of y = x^2')
plt.legend()
plt.grid(True)
plt.show()

# Creating subplots of df: a 2x2 grid with one iris measurement per panel
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
df['Sepal.Length'].plot(ax=axes[0, 0], color='r', title='Sepal Length')
df['Sepal.Width'].plot(ax=axes[0, 1], linestyle='--', marker='o', color='g', label='Data', title='Sepal Width')
df['Petal.Length'].plot(ax=axes[1, 0], color='b', title='Petal Length', legend=True)
df['Petal.Width'].plot(ax=axes[1, 1], color='y', title='Petal Width', legend=True, grid=True,label='Petal Width')
# add equation to axes[1, 1]; the raw string keeps the backslash of \sqrt for mathtext
# (in a normal string literal "\s" is an invalid escape sequence and triggers a warning)
axes[1, 1].text(30, 1.5, r'$y = \sqrt{x^2+1}$', fontsize=20, horizontalalignment='center', verticalalignment='center')
# save plot
#plt.savefig('subplots.png', dpi=300, bbox_inches='tight')

Text(30, 1.5, '$y = \\sqrt{x^2+1}$')

import matplotlib.gridspec as gridspec

# Create a figure and gridspec with unequal width- and height-ratios
fig = plt.figure(figsize=(8, 6))
gs = gridspec.GridSpec(2, 2, width_ratios=[2, 1], height_ratios=[1, 2])

# Add plots to the subplots
ax0 = plt.subplot(gs[0, 0])
N = 10
data = (np.geomspace(1, 10, 100) + np.random.randn(N, 100)).T

ax0.plot(data)
ax0.set_title('Plot 1')

ax1 = plt.subplot(gs[0, 1])
ax1.hist(np.random.randn(1000), bins=20, color='g', alpha=0.6, edgecolor='black')
ax1.set_title('Plot 2')

ax2 = plt.subplot(gs[1, 0])
x = np.linspace(0.1, 2 * np.pi, 41)
y = np.exp(np.sin(x))
ax2.stem(x, y)
ax2.set_title('Plot 3')

ax3 = plt.subplot(gs[1, 1])
ax3.bar(['A', 'B', 'C', 'D'], [10, 20, 30, 40], color='y', alpha=0.7)
ax3.set_title('Plot 4')

# Adjust layout
plt.tight_layout()

# Show the figure
plt.show()

from scipy import stats
# Generate 1000 random numbers from a normal distribution with mean 0 and standard deviation 1
data = np.random.randn(1000)
# group1
group1 = data[:500]
# group2
group2 = data[500:]
# group3
group3 = np.random.randn(500)
# Independent t-test
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f'T-statistic: {t_stat}, P-value: {p_value}')

T-statistic: 0.31639492837773037, P-value: 0.7517689292338926

# One-way ANOVA
f_stat, p_value = stats.f_oneway(group1, group2, group3)
print(f'F-statistic: {f_stat}, P-value: {p_value}')

F-statistic: 0.05427300180328578, P-value: 0.9471753547403352

# Chi-square test for independence
observed_data = np.array([[10, 10, 20], [20, 20, 20]])
chi2_stat, p_value, dof, expected = stats.chi2_contingency(observed_data)
print(f'Chi2 statistic: {chi2_stat}, P-value: {p_value}, Degrees of freedom: {dof}')

Chi2 statistic: 2.7777777777777777, P-value: 0.24935220877729622, Degrees of freedom: 2

import statsmodels.api as sm
# remove missing values
df.dropna(inplace=True)
y=df['Sepal.Length'] # dependent variable
X=df[['Sepal.Width','Petal.Length','Petal.Width']] # independent variables
# Simple linear regression
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:           Sepal.Length   R-squared:                       0.806
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     201.4
Date:                Wed, 20 Sep 2023   Prob (F-statistic):           1.69e-51
Time:                        23:00:04   Log-Likelihood:                -60.431
No. Observations:                 149   AIC:                             128.9
Df Residuals:                     145   BIC:                             140.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            2.4966      0.276      9.055      0.000       1.952       3.042
Sepal.Width      0.5380      0.076      7.071      0.000       0.388       0.688
Petal.Length     0.4576      0.053      8.703      0.000       0.354       0.562
Petal.Width     -0.0160      0.122     -0.132      0.895      -0.256       0.224
==============================================================================
Omnibus:                        6.859   Durbin-Watson:                   2.011
Prob(Omnibus):                  0.032   Jarque-Bera (JB):                8.525
Skew:                          -0.286   Prob(JB):                       0.0141
Kurtosis:                       4.023   Cond. No.                         50.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

from sklearn.linear_model import LinearRegression

y = df['Sepal.Length']  # dependent variable
X = df[['Sepal.Width', 'Petal.Length', 'Petal.Width']]  # independent variables

model = LinearRegression()
model.fit(X, y)

print(f'Intercept: {model.intercept_}')
print(f'Coefficients: {model.coef_}')

Intercept: 2.4965574039645433
Coefficients: [ 0.53803901  0.4575952  -0.01601361]

import seaborn as sns
# One fixed color per species, shared by the regression lines and the
# correlation labels so that each label matches its line.
# NOTE(review): assumes the Species labels have already been normalized to
# exactly these three lowercase names; any other label raises KeyError below.
colors = {'setosa': 'blue', 'versicolor': 'green', 'virginica': 'red'}

# Scatter plot plus a fitted regression line per species on a single axes.
g = sns.lmplot(
    data=df,
    x='Sepal.Width',
    y='Sepal.Length',
    hue='Species',
    palette=colors,  # custom colors defined above
    fit_reg=True,
    legend=True,
    facet_kws={'legend_out': True},
)

# Label each species with the correlation between its sepal width and sepal
# length. The text is drawn on the grid's own axes (g.ax) instead of through
# pyplot's implicit "current axes", so it lands on this plot even when other
# figures are open.
ax = g.ax
species_groups = df.groupby('Species')
for species_name, species_group in species_groups:
    width = species_group['Sepal.Width']
    length = species_group['Sepal.Length']
    corr = width.corr(length)
    # Place the label 0.5 units up and to the right of the group's mean point.
    ax.text(
        width.mean() + 0.5,
        length.mean() + 0.5,
        f'Corr: {corr:.2f}',
        fontsize=12,
        color=colors[species_name],
    )

g.set_axis_labels('Sepal Width', 'Sepal Length')

# To save the plot, uncomment the next line. Saving has to happen BEFORE
# plt.show(): once the figure has been shown, a later save can come out blank.
# g.savefig('seaborn.png', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

Syntax	Description
`print("Hello, World!")`	Print the string "Hello, World!"
`# This is a comment`	Single-line comment
`"""Multiline string"""`	Multiline string (used for documentation)
`x = 5`	Assign the value 5 to variable x
`if/for/while condition:`	Control-flow statement (`if` = conditional, `for`/`while` = loops)
`import module`	Import a Python module
`from module import x`	Import specific variable/function from a module

Data Type	Description	Example
int	Integer (whole number)	`42`
float	Floating-point (decimal)	`3.14`
str	String (text)	`'Hello, World!'`
bool	Boolean (True/False)	`True` or `False`
list	List (ordered collection)	`[1, 2, 3]`
tuple	Tuple (immutable collection)	`(1, 'apple', 3.14)`
set	Set (unordered collection)	`{1, 2, 3}`
dict	Dictionary (key-value pairs)	`{'name': 'John', 'age': 30}`
NoneType	None (represents absence)	`None`

Operation	Code Example	Description
Accessing an element by index	`my_list[index]`	Retrieves the element at the specified index in the list.
Slicing	`my_list[start:stop:step]`	Extracts a portion of the list based on start, stop, and step.
Accessing the last element	`my_list[-1]`	Accesses the last element of the list.
Adding an item at the end	`my_list.append(item)`	Appends an item to the end of the list.
Inserting an item at an index	`my_list.insert(index, item)`	Inserts an item at the specified index in the list.
Removing an item by value	`my_list.remove(item)`	Removes the first occurrence of the item with the given value.
Removing an item by index	`del my_list[index]`	Deletes the item at the specified index.
Checking if an item exists	`item in my_list`	Checks if an item exists in the list.
Finding the index of an item	`my_list.index(item)`	Returns the index of the first occurrence of the item.

Operation	Code Example	Description
Accessing a value by key	`my_dict[key]`	Retrieves the value associated with the specified key.
Adding a key-value pair	`my_dict[key] = value`	Inserts a new key-value pair into the dictionary.
Removing a key-value pair	`del my_dict[key]`	Deletes the key-value pair with the specified key.
Checking if a key exists	`key in my_dict`	Checks if a key exists in the dictionary.
Getting all keys	`my_dict.keys()`	Returns a view of all keys in the dictionary (use `list(my_dict.keys())` for a list).
Getting all values	`my_dict.values()`	Returns a view of all values in the dictionary (use `list(my_dict.values())` for a list).
Getting key-value pairs	`my_dict.items()`	Returns a view of the key-value pairs as `(key, value)` tuples.

NumPy Function	Description
`np.array([1, 2, 3])`	Create a 1D array
`np.zeros((2, 3))`	Create a 2D array filled with zeros
`np.ones((2, 3))`	Create a 2D array filled with ones
`np.arange(0, 10, 2)`	Create an array with a range of values
`np.linspace(0, 1, 5)`	Create an array with evenly spaced values
`np.eye(3)`	Create a 3x3 identity matrix
`np.random.rand(2, 2)`	Create a 2x2 array with random values in [0, 1) (1 is excluded)
`np.sum(arr)`	Sum of all elements in the array
`np.mean(arr)`	Mean of all elements in the array
`np.max(arr)`	Maximum value in the array
`np.min(arr)`	Minimum value in the array
`np.argmax(arr)`	Index of the maximum value
`np.argmin(arr)`	Index of the minimum value
`np.reshape(arr, (2, 3))`	Reshape the array
`np.transpose(arr)`	Transpose the array
`np.dot(arr1, arr2)`	Dot product of two arrays
`np.concatenate((a, b), axis=0)`	Concatenate arrays vertically (axis=0)
`np.vstack((a, b))`	Stack arrays vertically
`np.hstack((a, b))`	Stack arrays horizontally

1. Python Fundamentals (Day 1)¶

1.1 Setup Python¶

(a) Install from Anaconda Distribution

(b) Open anaconda prompt /terminal¶

1.2 Basic Python syntax and data types¶

Data type¶

Python Reserved Words¶

Operators¶

The most used Python data structures are lists and dictionaries¶

1. List¶

2. Dictionaries:¶

Add function from built-in library or third-party library¶

D¶

Accessing items from list and Dictionary¶

Accessing, Indexing, Adding, and Deleting Items Cheatsheet¶

Lists¶

Dictionaries¶

Control: if / for /while¶

Date time¶

Day 2 Advanced Python concepts¶

Some special functions and built-in functions in Python, including map, zip, filter, and reduce¶

Key Differences:¶

Output Type:¶

Functionality:¶

Use Cases:¶

2. Numpy and Data Analysis with Pandas (Day 2)¶

Numpy¶

Array Operations via numpy¶

Pandas [Data analysis library]¶

Data reading, exploration and writing¶

Data Manipulation¶

Data complex Filtering¶

Data Cleaning¶

Cleaning¶

pandas long query chain¶

3. Data Visualization and Real-World Projects (Day 3)¶

Statistical test¶

Advanced plotting with seaborn¶

Advanced statistical plot¶

Command	Description
`pd.read_csv(data)`	Read file
`pd.Series(data)`	Create a Series
`df.info()`	Get DataFrame info
`df.describe()`	Summary statistics
`pd.DataFrame(data)`	Create a DataFrame
`df['column_name']`	Select a single column
`df[['col1', 'col2']]`	Select multiple columns
`df.loc[row_label]`	Select a row by label
`df.iloc[row_index]`	Select a row by integer position (0-based)
`df.head(n)`	Display the first n rows
`df.tail(n)`	Display the last n rows
`df.shape`	Get the shape (rows, columns)
`df.to_csv()`	Save data

Command	Description
`df[df['col'] > value]`	Boolean indexing
`df.query('expression')`	Query by expression
`df[(df['col1'] > val1) & (df['col2'] < val2)]`	Multiple conditions

Command	Description
`df.rename(columns={'old': 'new'})`	Rename columns
`df.drop_duplicates()`	Remove duplicates
`df.set_index('column_name')`	Set column as index
`df.reset_index()`	Reset index
`df.fillna(value)`	Fill missing values

	Name	Age	Gender	Salary
0	Alice	25	Female	50000
1	Bob	30	Male	60000
2	Charlie	35	Male	75000
3	David	40	Male	80000
4	Eva	45	Female	55000

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
0	5.1	3.5	1.4	0.2	Setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
count	150.000000	150.000000	149.000000	150.000000
mean	5.843333	3.057333	3.773154	1.199333
std	0.828066	0.435866	1.793104	0.762238
min	4.300000	2.000000	0.000000	0.100000
25%	5.100000	2.800000	1.500000	0.300000
50%	5.800000	3.000000	4.400000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
Sepal.Length	1.000000	-0.117570	0.856971	0.817941
Sepal.Width	-0.117570	1.000000	-0.412193	-0.366126
Petal.Length	0.856971	-0.412193	1.000000	0.939200
Petal.Width	0.817941	-0.366126	0.939200	1.000000

	Sepal.Length
Species
Setosa	5.100000
seTosa	5.000000
setosa	5.004167
versicolor	5.936000
virginica	6.588000

	mean	median	std	min	max
Species
setosa	5.006122	5.0	0.356141	4.3	5.8
versicolor	5.936000	5.9	0.516171	4.9	7.0
virginica	6.588000	6.5	0.635880	4.9	7.9

	Full Name	Employee Age	Salary	Salary_Increase	Yearly_Salary
0	DAVID	40	80000	88000.0	1056000.0
1	CHARLIE	35	75000	82500.0	990000.0
2	BOB	30	60000	66000.0	792000.0

	12	91	15	28	16	19	17	93	24	13	id	sst	59	86	69	35	8	83	56
0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5.510590	7.809940	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	7.798259	8.325629	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.488465	1.036125	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4.336252	5.043532	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	7.459540	1.735971	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Sepal Length	Sepal Width	Petal Length	Petal Width	Sepal Area	Petal Area
0	7.0	3.2	4.7	1.4	22.40	6.58
1	6.9	3.1	4.9	1.5	21.39	7.35
3	6.7	3.1	4.7	1.5	20.77	7.05
5	6.7	3.1	4.4	1.4	20.77	6.16

Matplotlib Function	Description
`plt.plot(x, y)`	Plot a line graph with data points `x` and `y`.
`plt.scatter(x, y)`	Create a scatter plot with data points `x` and `y`.
`plt.bar(x, height)`	Generate a bar chart with values `x` and `height`.
`plt.hist(data, bins)`	Create a histogram with the given data and bins.
`plt.xlabel('xlabel')`	Set the label for the x-axis.
`plt.ylabel('ylabel')`	Set the label for the y-axis.
`plt.title('title')`	Add a title to the plot.
`plt.legend()`	Display a legend for labeled elements on the plot.
`plt.grid(True)`	Enable gridlines on the plot.
`plt.savefig('filename.png')`	Save the current plot as an image file.

Seaborn Function	Description
`sns.scatterplot(x, y, data)`	Create a scatter plot with two numeric variables
`sns.lineplot(x, y, data)`	Create a line plot with two numeric variables
`sns.barplot(x, y, data)`	Create a bar plot with a categorical and numeric variable
`sns.countplot(x, data)`	Create a count plot for categorical data
`sns.boxplot(x, y, data)`	Create a box plot to visualize data distribution
`sns.violinplot(x, y, data)`	Create a violin plot to show data distribution
`sns.heatmap(data)`	Create a heatmap to visualize data correlations
`sns.pairplot(data)`	Create a pair plot for multiple variables
`sns.distplot(data)`	Create a distribution plot for a numeric variable (deprecated; use `sns.histplot(data)` or `sns.displot(data)` in current seaborn)
`sns.lmplot(x, y, data)`	Create a linear regression plot
`sns.catplot(x, y, data)`	Create a categorical plot