Instructions
Requirements and Specifications
Source Code
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Define function to display main menu and ask for option
def menu():
    """Show the main menu and return the option chosen by the user.

    The menu is printed again after every rejected entry, until the user
    types an integer between 1 and 7 (inclusive).

    Returns:
        int: The selected option, in the range 1..7.

    Raises:
        EOFError: If standard input is closed before a valid option is read.
    """
    while True:
        print("Please choose from the following options:")
        print("\t1 - Load data from a file")
        print("\t2 - View data")
        print("\t3 - Clean data")
        print("\t4 - Analyse data")
        print("\t5 - Visualise data")
        print("\t6 - Save data to a file")
        print("\t7 - Quit")
        try:
            option = int(input())
        except ValueError:
            # Only a non-numeric entry is a user mistake. EOFError and
            # KeyboardInterrupt must propagate, otherwise the loop could
            # never be left once stdin is closed or Ctrl-C is pressed.
            print("Please enter a valid menu option")
            continue
        if 1 <= option <= 7:
            return option
        print("Please enter a valid option.")
# Define a function to display the menu for the 'Clean Data' option
def cleandata_menu():
    """Show the 'Clean data' sub-menu and return the option chosen by the user.

    The menu is printed again after every rejected entry, until the user
    types an integer between 1 and 6 (inclusive).

    Returns:
        int: The selected option, in the range 1..6.

    Raises:
        EOFError: If standard input is closed before a valid option is read.
    """
    while True:
        print("Cleaning data:")
        print("\t1 - Drop rows with missing values")
        print("\t2 - Fill missing values")
        print("\t3 - Drop duplicate rows")
        print("\t4 - Drop column")
        print("\t5 - Rename column")
        print("\t6 - Finish cleaning")
        try:
            option = int(input())
        except ValueError:
            # Catch only the parse failure; a bare except would also trap
            # EOFError/KeyboardInterrupt and make the loop impossible to exit.
            print("Please enter a valid menu option")
            continue
        if 1 <= option <= 6:
            return option
        print("Please enter a valid option.")
# Define a function to get an integer from user. The integer must be between [lb, ub]
def get_int(message, lb, ub):
    """Prompt until the user enters an integer n such that lb <= n <= ub.

    Args:
        message: Prompt text passed to ``input``.
        lb: Inclusive lower bound. Any value comparable with ``int`` works,
            including ``-inf`` for "no lower bound".
        ub: Inclusive upper bound. Any value comparable with ``int`` works,
            including ``inf`` for "no upper bound".

    Returns:
        int: The first entered integer that lies within [lb, ub].

    Raises:
        EOFError: If standard input is closed before a valid value is read.
    """
    while True:
        try:
            value = int(input(message))
        except ValueError:
            # Only the failed conversion is handled here, so EOFError and
            # KeyboardInterrupt still terminate the prompt loop.
            print("Please enter a valid integer.")
            continue
        if lb <= value <= ub:
            return value
        print(f"Please enter a value between {lb} and {ub}.")
# Main code
def _load_data():
    """Ask for a CSV file name, load it and optionally set an index column.

    Returns:
        The loaded DataFrame, or None when the file could not be read.
    """
    file_name = input("Enter file name: ")
    try:
        data = pd.read_csv(file_name)
    except (OSError, ValueError):
        # OSError covers missing/unreadable files. pandas' ParserError and
        # EmptyDataError, as well as UnicodeDecodeError, derive from ValueError.
        print("File does not exist or could not be loaded.")
        return None
    print(f" File {file_name} correctly loaded!")
    # Ask if s/he wants to set a column name as index (blank = keep default)
    while True:
        col_name = input("Enter column name to be set as index: ")
        if not col_name:
            break
        if col_name in data.columns:
            # set_index already removes the column from the frame; dropping it
            # again would raise KeyError.
            data = data.set_index(col_name)
            break
        print("Sorry, the data does not contain a column with that name.")
    return data


def _choose_column(data, question):
    """Print `question` and the column names, then return the user's raw entry."""
    print(question)
    for c in data.columns:
        print(f"\t{c}")
    return input()


def _clean_data(data):
    """Run the interactive cleaning sub-menu and return the cleaned DataFrame."""
    while True:
        opt = cleandata_menu()
        if opt == 6:  # finish cleaning
            return data
        if opt == 1:  # drop rows with missing values
            threshold = get_int("Enter the treshold for dropping rows: ", 1, np.inf)
            # Keep only rows with fewer than `threshold` missing values
            data = data[data.isnull().sum(axis=1) < threshold]
        elif opt == 2:  # fill missing values
            replacement = get_int("Enter the replacement value", -np.inf, np.inf)
            # fillna returns a new frame; the result has to be kept
            data = data.fillna(replacement)
        elif opt == 3:  # drop duplicate rows
            n_before = len(data)
            # drop_duplicates returns a new frame; the result has to be kept
            data = data.drop_duplicates()
            print(f"{n_before - len(data)} rows dropped.")
        elif opt == 4:  # drop column
            column = _choose_column(
                data, "Which column do you want to drop? (leave blank for none)"
            )
            if not column:
                print("No column dropped.")
            elif column in data.columns:
                data = data.drop(columns=[column])
                print(f"{column} dropped.")
            else:
                print("Invalid selection!")
        elif opt == 5:  # rename column
            column = _choose_column(
                data, "Which column do you want to rename? (leave blank for none)"
            )
            if not column:
                print("No column renamed.")
            elif column in data.columns:
                new_column = input("Enter the new name: ")
                # rename returns a new frame; the result has to be kept
                data = data.rename(columns={column: new_column})
                print(f"{column} renamed to {new_column}.")
            else:
                print("Invalid selection!")
        # NOTE(review): the original indentation was lost; showing the data
        # after every cleaning operation is assumed here - confirm.
        print(data)


def _analyse_data(data):
    """Print summary statistics per numeric column and the correlation table."""
    # The statistics and the '.2f' formatting below are only defined for
    # numbers, so non-numeric columns are left out instead of crashing.
    numeric = data.select_dtypes(include="number")
    for c in numeric.columns:
        series = numeric[c]
        heading = str(c)  # column labels are not guaranteed to be strings
        print(heading)
        print('-' * len(heading))
        print("{:<15s}:{:>5d}".format("number of values (n)", int(series.count())))
        print("{:<15s}:{:>5.2f}".format("minimum", series.min()))
        print("{:<15s}:{:>5.2f}".format("maximum", series.max()))
        print("{:<15s}:{:>5.2f}".format("mean", series.mean()))
        print("{:<15s}:{:>5.2f}".format("median", series.median()))
        print("{:<15s}:{:>5.2f}".format("standard deviation", series.std()))
        print("{:<15s}:{:>5.2f}".format("std. err. of mean", series.sem()))
    # Display correlation table
    print(numeric.corr())


def _visualise_data(data):
    """Ask for plot kind, layout and labels, then draw the requested plot(s)."""
    while True:
        # Ask for plot type
        print("Please choose from the following kinds: line, bar, box")
        # Normalise once so validation and the later comparisons agree
        plot_type = input().lower()
        if plot_type not in ('line', 'bar', 'box'):
            print("Invalid selection!")
            continue
        print("Do you want subplots? (y/n)")
        yn = input().lower()
        if yn not in ('y', 'n'):
            print("Invalid selection!")
            continue
        break

    plot_title = input("Please enter the title for the plot (leave blank for no title)\n")
    x_label = input("Please enter the x-axis label (leave blank for no label).\n")
    y_label = input("Please enter the y-axis label (leave balnk for no label).\n")

    if plot_type == 'box':
        # A box plot already shows every column side by side, so the
        # subplot answer makes no difference here.
        _, ax = plt.subplots()
        data.boxplot(ax=ax)
        ax.set_title(plot_title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        plt.show()
        return

    # Line and bar plots need numeric data
    columns = list(data.select_dtypes(include="number").columns)
    if not columns:
        print("No numeric columns to plot.")
        return

    if yn == 'y':  # one figure, one row of axes per column
        # squeeze=False keeps `axes` two-dimensional even for a single column
        _, axes = plt.subplots(nrows=len(columns), ncols=1, squeeze=False)
        for ax, c in zip(axes[:, 0], columns):
            data.plot(y=c, kind=plot_type, use_index=True, ax=ax)
            ax.set_title(plot_title)
            ax.set_xlabel(x_label)
            ax.set_ylabel(y_label)
    else:  # one separate figure per column
        for c in columns:
            # DataFrame.plot creates its own figure and returns its axes, so
            # no extra (empty) figure is opened beforehand.
            ax = data.plot(y=c, kind=plot_type, use_index=True)
            ax.set_title(plot_title)
            ax.set_xlabel(x_label)
            ax.set_ylabel(y_label)
    plt.show()


def _save_data(data):
    """Ask for a file name and write the data to it as CSV."""
    file_name = input("Enter the filename, including extension: ")
    try:
        data.to_csv(file_name, sep=',')
    except OSError:
        print(f"Data could not be saved to {file_name}")


current_data = None  # Variable to store the current loaded data
running = True
while running:
    option = menu()
    if option == 1:
        loaded = _load_data()
        # A failed load keeps whatever data was loaded before
        if loaded is not None:
            current_data = loaded
    elif option == 7:
        running = False
        print("Goodbye")
    elif current_data is None:
        # `if current_data:` would raise for a DataFrame (ambiguous truth
        # value), so the "nothing loaded" state is tested explicitly.
        print("No data loaded.")
    elif option == 2:
        print(current_data)
    elif option == 3:
        current_data = _clean_data(current_data)
    elif option == 4:  # Analyse data
        _analyse_data(current_data)
    elif option == 5:  # Visualize
        _visualise_data(current_data)
    elif option == 6:  # Save to a file
        _save_data(current_data)