Commit be3b5e3d authored by LEVREL Victor's avatar LEVREL Victor
Browse files

ajout code Victor

parent c62e79c8
File added
tp1/images_rapport/ACP_cluster.png

63.8 KB

tp1/images_rapport/TSNE_cluster.png

49.3 KB

tp1/images_rapport/cosine_all.png

30.4 KB

tp1/images_rapport/cosine_selected.png

32.1 KB

tp1/images_rapport/euclidean_all.png

29.4 KB

tp1/images_rapport/euclidean_selected.png

30.2 KB

tp1/images_rapport/mahalanobis_selected.png

27.4 KB

asttokens==2.4.1
certifi==2024.8.30
charset-normalizer==3.3.2
comm==0.2.2
contourpy==1.3.0
cycler==0.12.1
debugpy==1.8.6
decorator==5.1.1
exceptiongroup==1.2.2
executing==2.1.0
fonttools==4.54.1
idna==3.10
ipykernel==6.29.5
ipython==8.27.0
jedi==0.19.1
joblib==1.4.2
jupyter_client==8.6.3
jupyter_core==5.7.2
kiwisolver==1.4.7
matplotlib==3.9.2
matplotlib-inline==0.1.7
nest-asyncio==1.6.0
numpy==2.1.1
packaging==24.1
panda==0.3.1
pandas==2.2.3
parso==0.8.4
pexpect==4.9.0
pillow==10.4.0
platformdirs==4.3.6
prompt_toolkit==3.0.48
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
Pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.9.0.post0
pytz==2024.2
pyzmq==26.2.0
requests==2.32.3
scikit-learn==1.5.2
scipy==1.14.1
seaborn==0.13.2
six==1.16.0
stack-data==0.6.3
tenacity==9.0.0
threadpoolctl==3.5.0
tornado==6.4.1
traitlets==5.14.3
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
wcwidth==0.2.13
File added
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
# Description: This file contains the functions to extract the data from the csv files and the functions to plot the data.
# =================================================================================
# LIBRARIES
# =================================================================================
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import distance
from sklearn.feature_selection import SelectKBest, f_classif
# =================================================================================
# FUNCTIONS
# =================================================================================
#__________________________________________________________________________________
# Functions to calculate the metrics
def minkowski(u, v, L):
    """
    Compute the Minkowski distance of degree L between u and v.
    For L=1 it is the Manhattan distance, for L=2 the Euclidean distance.

    Parameters
    ----------
    u : array-like, the first input vector (1-D)
    v : array-like, the second input vector (1-D, same length as u)
    L : int or float, degree of the Minkowski distance (must be > 0)

    Output
    ------
    dist : float, the Minkowski distance between u and v of degree L

    Raises
    ------
    ValueError : if u or v is not 1-D, if they differ in length,
                 or if L is not strictly positive
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    if u.ndim != 1 or v.ndim != 1:
        raise ValueError("minkowski expects two 1-D vectors")
    # Without this check a shorter `u` would silently ignore the tail of `v`.
    if u.shape != v.shape:
        raise ValueError(
            f"u and v must have the same length, got {u.shape[0]} and {v.shape[0]}"
        )
    # 1/L is undefined for L == 0 and the formula is not a distance for L < 0.
    if L <= 0:
        raise ValueError("the degree L must be strictly positive")
    # Vectorized form of sum(|u_k - v_k|^L)^(1/L).
    return float(np.sum(np.abs(u - v) ** L) ** (1 / L))
def cosine_distance(u, v):
    """
    Compute the cosine distance (1 - cosine similarity) between u and v.

    Parameters
    ----------
    u : numpy.ndarray, the first input vector
    v : numpy.ndarray, the second input vector

    Output
    ------
    cosine_distance : float, the cosine distance between u and v
    """
    # Product of the two Euclidean norms, used to normalise the dot product.
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    similarity = np.dot(u, v) / norm_product
    return 1 - similarity
#__________________________________________________________________________________
# Functions to compute class centers, intra/inter-class distances and overlaps
def get_center_of_class(chosen_class, data, labels):
    """
    Compute the center (feature-wise mean) of the samples of a class.

    Parameters
    ----------
    chosen_class : label value, the class for which we want to compute the center
    data : numpy.ndarray, the input dataset (one sample per row)
    labels : numpy.ndarray, the labels of the dataset, one per sample
             (either 1-D or a single column; it is flattened before comparison)

    Output
    ------
    x_center : numpy.ndarray, the center of the class. If no sample carries
               `chosen_class`, an array filled with NaN is returned.

    Raises
    ------
    ValueError : if data and labels do not contain the same number of samples
    """
    data = np.asarray(data)
    flat_labels = np.asarray(labels).ravel()
    if flat_labels.shape[0] != data.shape[0]:
        raise ValueError(
            f"data has {data.shape[0]} samples but labels has {flat_labels.shape[0]}"
        )
    members = data[flat_labels == chosen_class]
    if members.shape[0] == 0:
        # Same NaN result a 0/0 division would give, but without the
        # RuntimeWarning that it triggers.
        return np.full(data.shape[1:], np.nan)
    return members.mean(axis=0)
def dist_intra_class(chosen_dist, chosen_class, genes_data, genes_labels, inv_cov_matrix=None):
    """
    Compute the maximum distance between the center of a class and the points of the class.

    Parameters
    ----------
    chosen_dist : str, the distance to use ('Euclidean', 'Mahalanobis' or 'Cosine')
    chosen_class : str, the class for which we want to compute the distance
                   (one of 'BRCA', 'PRAD', 'KIRC', 'LUAD', 'COAD')
    genes_data : numpy.ndarray, the input dataset (one sample per row)
    genes_labels : numpy.ndarray, the labels of the dataset; the label of
                   sample i is read from genes_labels[i][0]
    inv_cov_matrix : numpy.ndarray, inverse covariance matrix passed to
                     scipy's mahalanobis; only used when chosen_dist is 'Mahalanobis'

    Output
    ------
    max_dist : float, the maximum distance between the center of the class and
               the points of the class (0.0 if the class has no sample),
               or -1 if the class or the distance is not recognised
    """
    # Validate both arguments up front, before any computation: an unknown
    # class must not trigger a center computation, and an unknown distance
    # must be reported even when no sample matches the class.
    if chosen_class not in ["BRCA", "PRAD", "KIRC", "LUAD", "COAD"]:
        print("Unrecognise class please try one of the following : 'BRCA', 'PRAD', 'KIRC', 'LUAD', 'COAD'")
        return -1
    if chosen_dist not in ('Euclidean', 'Mahalanobis', 'Cosine'):
        print("Unrecognise distance please try one of the following : 'Euclidean', 'Mahalanobis', 'Cosine'")
        return -1

    center = get_center_of_class(chosen_class, genes_data, genes_labels)
    max_dist = 0.0
    for sample, label in zip(genes_data, genes_labels):
        if label[0] != chosen_class:
            continue
        if chosen_dist == 'Euclidean':
            current = minkowski(center, sample, 2)
        elif chosen_dist == 'Mahalanobis':
            current = distance.mahalanobis(center, sample, inv_cov_matrix)
        else:  # 'Cosine'
            current = cosine_distance(center, sample)
        max_dist = max(max_dist, current)
    return max_dist
def get_dist_between_classes(class_1, class_2, chosen_dist, genes_data, genes_labels, inv_cov_matrix=None):
    """
    Compute the minimum distance between the samples of class_1 and the
    center of class_2, with the chosen distance.

    Parameters
    ----------
    class_1 : str, the class whose samples are scanned
    class_2 : str, the class whose center is used as reference
    chosen_dist : str, the distance to use ('Euclidean', 'Mahalanobis' or 'Cosine')
    genes_data : numpy.ndarray, the input dataset (one sample per row)
    genes_labels : numpy.ndarray, the labels of the dataset; the label of
                   sample i is read from genes_labels[i][0]
    inv_cov_matrix : numpy.ndarray, inverse covariance matrix passed to
                     scipy's mahalanobis; only used when chosen_dist is 'Mahalanobis'

    Output
    ------
    min_center : float, the smallest distance from a sample of class_1 to the
                 center of class_2, or -1 if the distance is not recognised

    Raises
    ------
    ValueError : if no sample of the dataset belongs to class_1
    """
    if chosen_dist not in ('Euclidean', 'Mahalanobis', 'Cosine'):
        return -1

    center_class_2 = get_center_of_class(class_2, genes_data, genes_labels)

    # Start from +inf rather than from the distance of genes_data[0]:
    # that first sample does not necessarily belong to class_1 (it can even
    # belong to class_2) and must not be able to win the minimum.
    min_center = float("inf")
    found_sample = False
    for sample, label in zip(genes_data, genes_labels):
        if label[0] != class_1:
            continue
        found_sample = True
        if chosen_dist == 'Euclidean':
            current = minkowski(sample, center_class_2, 2)
        elif chosen_dist == 'Mahalanobis':
            current = distance.mahalanobis(sample, center_class_2, inv_cov_matrix)
        else:  # 'Cosine'
            current = cosine_distance(sample, center_class_2)
        min_center = min(min_center, current)

    if not found_sample:
        raise ValueError(f"no sample of class {class_1!r} found in the labels")
    return min_center
def dist_inter_class(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix=None):
    """
    Compute the distance between two classes with the chosen distance, taken
    as the smaller of the two directed distances returned by
    get_dist_between_classes (class_1 -> class_2 and class_2 -> class_1).

    Parameters
    ----------
    class_1 : str, the first class
    class_2 : str, the second class
    chosen_distance : str, the distance to use
    genes_data : numpy.ndarray, the input dataset
    genes_labels : numpy.ndarray, the labels of the dataset
    inv_cov_matrix : numpy.ndarray, forwarded unchanged to get_dist_between_classes

    Output
    ------
    min : float, distance between the two classes
    """
    # Arguments shared by both directed computations.
    shared_args = (chosen_distance, genes_data, genes_labels, inv_cov_matrix)
    forward = get_dist_between_classes(class_1, class_2, *shared_args)
    backward = get_dist_between_classes(class_2, class_1, *shared_args)
    return min(forward, backward)
def get_covariance_matrix(array):
    """
    Compute the covariance matrix of an array with numpy, treating each
    column as a variable and each row as an observation.

    Parameters
    ----------
    array : array-like, the input array (converted to float)

    Output
    ------
    cov_matrix : numpy.ndarray, the covariance matrix of the array
    """
    observations = np.array(array, dtype=float)
    # rowvar=False: variables are stored in columns.
    return np.cov(observations, rowvar=False)
def is_invertible(matrix):
    """
    Tell whether a matrix is invertible: it must be square and of full rank.

    Parameters
    ----------
    matrix : numpy.ndarray, the input array (matrix)

    Output
    ------
    boolean : bool, True if the matrix is invertible, False otherwise
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    # A non-square matrix can never be inverted; skip the rank computation.
    if n_rows != n_cols:
        return False
    return np.linalg.matrix_rank(matrix) == n_rows
def invert_conv_matrix(cov_matrix):
    """
    Invert the covariance matrix; fall back to the Moore-Penrose
    pseudo-inverse when is_invertible reports it is not invertible.

    Parameters
    ----------
    cov_matrix : numpy.ndarray, the input covariance matrix

    Output
    ------
    inv_cov_matrix : numpy.ndarray, the inverse (or pseudo-inverse) of the covariance
    """
    if is_invertible(cov_matrix):
        return np.linalg.inv(cov_matrix)
    return np.linalg.pinv(cov_matrix)
def get_genes_from_class(genes_data, genes_labels, chosen_class):
    """
    Select the rows of the dataset whose label equals the chosen class.

    Parameters
    ----------
    genes_data : numpy.ndarray, the input dataset
    genes_labels : numpy.ndarray, the labels of the dataset (flattened before comparison)
    chosen_class : label value, the class for which we want to get the genes

    Output
    ------
    class_genes : numpy.ndarray, the genes of the chosen class
    """
    matches = genes_labels.ravel() == chosen_class
    # Positions of the samples carrying the requested label.
    row_positions = np.nonzero(matches)[0]
    return genes_data[row_positions]
def get_cov_matrix_of_class(genes_data, genes_labels, chosen_class):
    """
    Compute the covariance matrix restricted to the samples of one class.

    Parameters
    ----------
    genes_data : numpy.ndarray, the input dataset
    genes_labels : numpy.ndarray, the labels of the dataset
    chosen_class : label value, the class for which we want the covariance matrix

    Output
    ------
    cov_matrix : numpy.ndarray, the covariance matrix of the chosen class
    """
    # Filter the samples first, then delegate to the generic covariance helper.
    return get_covariance_matrix(
        get_genes_from_class(genes_data, genes_labels, chosen_class)
    )
def overlap(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix=None):
    """
    Compute the overlap between two classes, defined as the sum of the two
    intra-class distances divided by twice the inter-class distance.

    Parameters
    ----------
    class_1 : str, the first class
    class_2 : str, the second class
    chosen_distance : str, the distance to use
    genes_data : numpy.ndarray, the input dataset
    genes_labels : numpy.ndarray, the labels of the dataset
    inv_cov_matrix : numpy.ndarray, forwarded unchanged to the distance helpers

    Output
    ------
    overlap : float, the overlap between the two classes
    """
    radius_1 = dist_intra_class(chosen_distance, class_1, genes_data, genes_labels, inv_cov_matrix)
    radius_2 = dist_intra_class(chosen_distance, class_2, genes_data, genes_labels, inv_cov_matrix)
    separation = dist_inter_class(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix)
    total_spread = radius_1 + radius_2
    return total_spread / (2 * separation)
def get_all_overlaps_for_distance(genes_data, genes_labels, chosen_dist):
    """
    Compute the overlap for every pair of classes (each class paired with
    itself included) with the chosen distance.

    Parameters
    ----------
    genes_data : numpy.ndarray, the input dataset
    genes_labels : numpy.ndarray, the labels of the dataset
    chosen_dist : str, the distance to use ('Euclidean', 'Cosine' or 'Mahalanobis')

    Output
    ------
    overlaps : list, list of tuples (overlap, first class, second class, distance);
               empty if the distance is not recognised
    """
    class_names = ["BRCA", "PRAD", "KIRC", "LUAD", "COAD"]

    if chosen_dist not in ('Euclidean', 'Cosine', 'Mahalanobis'):
        print("Unrecognise distance please try one of the following : 'Euclidean', 'Mahalanobis', 'Cosine'")
        return []

    if chosen_dist == 'Mahalanobis':
        # One inverse covariance matrix per class, computed before any overlap.
        inverse_covariances = [
            invert_conv_matrix(get_cov_matrix_of_class(genes_data, genes_labels, name))
            for name in class_names
        ]
    else:
        # Euclidean and Cosine do not use a covariance matrix.
        inverse_covariances = [None] * len(class_names)

    results = []
    for first_pos, first_name in enumerate(class_names):
        for second_pos in range(first_pos, len(class_names)):
            second_name = class_names[second_pos]
            # The matrix of the second class of the pair is the one passed on.
            score = overlap(
                first_name,
                second_name,
                chosen_dist,
                genes_data,
                genes_labels,
                inverse_covariances[second_pos],
            )
            results.append((score, first_name, second_name, chosen_dist))
    return results
def get_best_features(data, labels, deb, end, pas):
    """
    Search the number of features k in range(deb, end, pas) that gives the
    lowest mean Mahalanobis overlap between all pairs of distinct classes.
    For each k the features are selected with SelectKBest(f_classif).

    Parameters
    ----------
    data : numpy.ndarray, the input dataset
    labels : numpy.ndarray, the labels of the dataset
    deb : int, the beginning of the range (first k tested)
    end : int, the end of the range (excluded)
    pas : int, the step

    Output
    ------
    best_features : numpy.ndarray, the selected features giving the lowest mean overlap
    best_overlaps : list, list of tuples (overlap, first class, second class, distance)
                    computed on best_features
    best_features.shape[1] : int, the number of best features

    Raises
    ------
    ValueError : if range(deb, end, pas) is empty (or pas is 0)
    """
    candidate_sizes = range(deb, end, pas)
    if len(candidate_sizes) == 0:
        raise ValueError("range(deb, end, pas) is empty: no number of features to test")

    all_classes = ["BRCA", "PRAD", "KIRC", "LUAD", "COAD"]
    chosen_dist = "Mahalanobis"

    best_features = None
    best_overlaps = None
    best_mean = None
    for n_features in candidate_sizes:
        selector = SelectKBest(score_func=f_classif, k=n_features)
        selected_features = selector.fit_transform(data, labels.flatten())

        # One inverse covariance matrix per class, on the selected features.
        inv_cov_matrices = [
            invert_conv_matrix(get_cov_matrix_of_class(selected_features, labels, name))
            for name in all_classes
        ]

        overlaps = []
        for i, first_class in enumerate(all_classes):
            for k in range(i + 1, len(all_classes)):
                score = overlap(first_class, all_classes[k], chosen_dist,
                                selected_features, labels, inv_cov_matrices[k])
                overlaps.append((score, first_class, all_classes[k], chosen_dist))

        mean_overlap = np.mean([entry[0] for entry in overlaps])
        # Keep features and overlaps together: they are only replaced when the
        # current k is strictly better, so the returned pair stays consistent.
        if best_mean is None or mean_overlap < best_mean:
            best_features = selected_features
            best_overlaps = overlaps
            best_mean = mean_overlap

    return best_features, best_overlaps, best_features.shape[1]
def print_table_overlaps(results, title):
    """
    Render the overlaps as a class-by-class table, save it as a PNG and show it.

    Parameters
    ----------
    results : list, list of tuples (overlap, first class, second class, distance)
    title : str, base name of the image written to '../images_rapport/<title>.png'
    """
    class_names = ['BRCA', 'PRAD', 'KIRC', 'LUAD', 'COAD']
    grid = pd.DataFrame(index=class_names, columns=class_names)
    # Each result fills the cell at (first class, second class); the distance
    # name carried in the tuple is not displayed.
    for score, row_class, col_class, _metric in results:
        grid.loc[row_class, col_class] = round(score, 4)

    figure, axes = plt.subplots(figsize=(8, 6))
    # Cells that received no score are shown empty.
    grid = grid.fillna("")
    rendered = axes.table(
        cellText=grid.values,
        rowLabels=grid.index,
        colLabels=grid.columns,
        cellLoc='center',
        loc='center',
    )
    rendered.set_fontsize(12)
    rendered.scale(1.5, 1.5)
    axes.axis('tight')
    axes.axis('off')

    output_path = '../images_rapport/'+ title + '.png'
    plt.savefig(output_path, bbox_inches='tight')
    plt.show()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment