ajout code Victor

be3b5e3d · LEVREL Victor · c62e79c8 · be3b5e3d · be3b5e3d · be3b5e3d
Commit be3b5e3d authored 6 months ago by LEVREL Victor
13 changed files
--- a/tp1/Révisé-TP1-specification.pdf
+++ b/tp1/Révisé-TP1-specification.pdf
--- a/tp1/images_rapport/ACP_cluster.png
+++ b/tp1/images_rapport/ACP_cluster.png
--- a/tp1/images_rapport/TSNE_cluster.png
+++ b/tp1/images_rapport/TSNE_cluster.png
--- a/tp1/images_rapport/cosine_all.png
+++ b/tp1/images_rapport/cosine_all.png
--- a/tp1/images_rapport/cosine_selected.png
+++ b/tp1/images_rapport/cosine_selected.png
--- a/tp1/images_rapport/euclidean_all.png
+++ b/tp1/images_rapport/euclidean_all.png
--- a/tp1/images_rapport/euclidean_selected.png
+++ b/tp1/images_rapport/euclidean_selected.png
--- a/tp1/images_rapport/mahalanobis_selected.png
+++ b/tp1/images_rapport/mahalanobis_selected.png
--- a/tp1/requirements.txt
+++ b/tp1/requirements.txt
+asttokens==2.4.1
+certifi==2024.8.30
+charset-normalizer==3.3.2
+comm==0.2.2
+contourpy==1.3.0
+cycler==0.12.1
+debugpy==1.8.6
+decorator==5.1.1
+exceptiongroup==1.2.2
+executing==2.1.0
+fonttools==4.54.1
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+jedi==0.19.1
+joblib==1.4.2
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+kiwisolver==1.4.7
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+nest-asyncio==1.6.0
+numpy==2.1.1
+packaging==24.1
+panda==0.3.1
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==10.4.0
+platformdirs==4.3.6
+prompt_toolkit==3.0.48
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+pytz==2024.2
+pyzmq==26.2.0
+requests==2.32.3
+scikit-learn==1.5.2
+scipy==1.14.1
+seaborn==0.13.2
+six==1.16.0
+stack-data==0.6.3
+tenacity==9.0.0
+threadpoolctl==3.5.0
+tornado==6.4.1
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+wcwidth==0.2.13
--- a/tp1/src/__pycache__/utils.cpython-310.pyc
+++ b/tp1/src/__pycache__/utils.cpython-310.pyc
--- a/tp1/src/partie1_Levrel.ipynb
+++ b/tp1/src/partie1_Levrel.ipynb
--- a/tp1/src/partie2_Levrel.ipynb
+++ b/tp1/src/partie2_Levrel.ipynb
--- a/tp1/src/utils.py
+++ b/tp1/src/utils.py
+# Description: This file contains helper functions to compute distances between samples, per-class statistics and class overlaps, and to render the overlap tables.
+
+# =================================================================================
+# LIBRARIES
+# ================================================================================= 
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.spatial import distance
+from sklearn.feature_selection import SelectKBest, f_classif
+
+
+
+# =================================================================================
+# FUNCTIONS
+# =================================================================================
+
+#__________________________________________________________________________________
+# Functions to calculate the metrics
+
+
+
+def minkowski(u, v, L):
+    """
+    Compute the Minkowski distance of degree L between u and v
+    (for L=2 it is the Euclidean distance).
+
+    Parameters
+    ----------
+        u       :  array-like,      the first input vector
+        v       :  array-like,      the second input vector, same shape as u
+        L       :  int or float,    degree of the Minkowski distance, strictly positive
+
+    Output
+    ------
+        dist    :  float,           the Minkowski distance between u and v of degree L
+
+    Raises
+    ------
+        ValueError : if u and v do not have the same shape, or if L is not strictly positive
+    """
+    u = np.asarray(u, dtype=float)
+    v = np.asarray(v, dtype=float)
+
+    # A silent truncation (or an IndexError half-way through) would hide a caller bug
+    if u.shape != v.shape:
+        raise ValueError(f"u and v must have the same shape, got {u.shape} and {v.shape}")
+
+    # The exponent 1/L below is undefined for L == 0 and meaningless for L < 0
+    if L <= 0:
+        raise ValueError(f"L must be strictly positive, got {L}")
+
+    # Vectorised sum of |u_k - v_k|**L instead of a Python-level loop
+    dist = float(np.sum(np.abs(u - v) ** L) ** (1 / L))
+
+    return dist
+
+def cosine_distance(u, v):
+    """
+    Compute the cosine distance between two vectors u and v,
+    i.e. 1 minus the cosine of the angle between them.
+
+    Parameters
+    ----------
+        u : array-like, the first input vector
+        v : array-like, the second input vector
+
+    Output
+    ------
+        cosine_distance : float, the cosine distance between u and v
+
+    Raises
+    ------
+        ValueError : if u or v has a zero norm (the angle is then undefined)
+    """
+    u = np.asarray(u, dtype=float)
+    v = np.asarray(v, dtype=float)
+
+    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
+
+    # Dividing by a zero norm would silently produce NaN, which callers
+    # using min()/max() then drop without any notice
+    if norm_product == 0:
+        raise ValueError("cosine distance is undefined for a zero-norm vector")
+
+    cosine_similarity = np.dot(u, v) / norm_product
+
+    return (1 - cosine_similarity)
+
+
+#__________________________________________________________________________________
+# Functions to compute class statistics and overlaps
+
+
+def get_center_of_class(chosen_class, data, labels):
+    """
+    Compute the center (mean sample) of a class
+
+    Parameters
+    ----------
+        chosen_class :  label value,    the class for which we want to compute the center (e.g. 'BRCA')
+        data :          numpy.ndarray,  the input dataset, one sample per row
+        labels :        numpy.ndarray,  the labels of the dataset, one per sample (flat or single-column)
+
+    Output
+    ------
+        x_center : numpy.ndarray, the center of the class; filled with NaN when
+                   no sample carries the label chosen_class
+
+    Raises
+    ------
+        ValueError : if data and labels do not describe the same number of samples
+    """
+    data = np.asarray(data, dtype=float)
+
+    # Accept both flat labels and single-column labels
+    flat_labels = np.asarray(labels).ravel()
+
+    if len(flat_labels) != len(data):
+        raise ValueError(f"data and labels must have the same length, got {len(data)} and {len(flat_labels)}")
+
+    class_samples = data[flat_labels == chosen_class]
+
+    # An absent class used to end in a division by zero; keep the same
+    # all-NaN result for existing callers, but make it explicit
+    if len(class_samples) == 0:
+        return np.full(data.shape[1:], np.nan)
+
+    x_center = class_samples.mean(axis=0)
+
+    return x_center
+
+def dist_intra_class(chosen_dist, chosen_class, genes_data, genes_labels, inv_cov_matrix=None):
+    """
+    Compute the maximum distance between the center of a class and the points of the class
+
+    Parameters
+    ----------
+        chosen_dist :    str,            the distance to use: 'Euclidean', 'Mahalanobis' or 'Cosine'
+        chosen_class :   str,            the class for which we want to compute the distance
+        genes_data :     numpy.ndarray,  the input dataset, one sample per row
+        genes_labels :   numpy.ndarray,  the labels of the dataset (flat or single-column)
+        inv_cov_matrix : numpy.ndarray,  inverse covariance matrix, required for 'Mahalanobis'
+
+    Output
+    ------
+        max_dist : float, the maximum distance between the center of the class and the points of the class,
+                   or -1 when the class or the distance is not recognised
+
+    Raises
+    ------
+        ValueError : if 'Mahalanobis' is requested without inv_cov_matrix
+    """
+
+    # Validate the arguments before doing any computation
+    if chosen_class not in ["BRCA", "PRAD", "KIRC", "LUAD", "COAD"]:
+        print("Unrecognise class please try one of the following : 'BRCA', 'PRAD', 'KIRC', 'LUAD', 'COAD'")
+        return -1
+
+    if chosen_dist not in ('Euclidean', 'Mahalanobis', 'Cosine'):
+        print("Unrecognise distance please try one of the following : 'Euclidean', 'Mahalanobis', 'Cosine'")
+        return -1
+
+    if chosen_dist == 'Mahalanobis' and inv_cov_matrix is None:
+        raise ValueError("inv_cov_matrix is required for the Mahalanobis distance")
+
+    center = get_center_of_class(chosen_class, genes_data, genes_labels)
+
+    # Flatten so that both flat and single-column label arrays are supported
+    flat_labels = np.asarray(genes_labels).ravel()
+
+    max_dist = 0.0
+
+    for sample, label in zip(genes_data, flat_labels):
+
+        if label != chosen_class:
+            continue
+
+        if chosen_dist == 'Euclidean':
+            current = minkowski(center, sample, 2)
+        elif chosen_dist == 'Mahalanobis':
+            current = distance.mahalanobis(center, sample, inv_cov_matrix)
+        else:
+            current = cosine_distance(center, sample)
+
+        max_dist = max(max_dist, current)
+
+    return max_dist
+
+def get_dist_between_classes(class_1, class_2, chosen_dist, genes_data, genes_labels, inv_cov_matrix=None):
+    """
+    Compute the minimum distance between the samples of class_1 and the
+    center of class_2 with the chosen distance
+
+    Parameters
+    ----------
+        class_1 :        str,            the class whose samples are scanned
+        class_2 :        str,            the class whose center is the reference point
+        chosen_dist :    str,            the distance to use: 'Euclidean', 'Mahalanobis' or 'Cosine'
+        genes_data :     numpy.ndarray,  the input dataset, one sample per row
+        genes_labels :   numpy.ndarray,  the labels of the dataset (flat or single-column)
+        inv_cov_matrix : numpy.ndarray,  inverse covariance matrix, used by 'Mahalanobis'
+
+    Output
+    ------
+        min_center :    float,          smallest distance from a sample of class_1 to the center of class_2;
+                                        -1 when the distance is not recognised,
+                                        inf when class_1 has no sample
+    """
+
+    if chosen_dist not in ('Euclidean', 'Cosine', 'Mahalanobis'):
+        return -1
+
+    center_class_2 = get_center_of_class(class_2, genes_data, genes_labels)
+
+    # Flatten so that both flat and single-column label arrays are supported
+    flat_labels = np.asarray(genes_labels).ravel()
+
+    # Start from +inf rather than from the distance of genes_data[0]: that
+    # first sample may not belong to class_1 and must not decide the minimum
+    min_center = float('inf')
+
+    for sample, label in zip(genes_data, flat_labels):
+
+        if label != class_1:
+            continue
+
+        if chosen_dist == 'Euclidean':
+            current = minkowski(sample, center_class_2, 2)
+        elif chosen_dist == 'Mahalanobis':
+            current = distance.mahalanobis(sample, center_class_2, inv_cov_matrix)
+        else:
+            current = cosine_distance(sample, center_class_2)
+
+        min_center = min(min_center, current)
+
+    return min_center
+
+def dist_inter_class(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix=None):
+    """
+    Compute the distance between two classes with the chosen distance, taken
+    as the smaller of the two directed results of get_dist_between_classes
+    (class_1 towards class_2, and class_2 towards class_1)
+
+    Parameters
+    ----------
+        class_1 :          str,            the first class
+        class_2 :          str,            the second class
+        chosen_distance :  str,            the distance to use
+        genes_data :       numpy.ndarray,  the input dataset
+        genes_labels :     numpy.ndarray,  the labels of the dataset
+        inv_cov_matrix :   numpy.ndarray,  inverse covariance matrix forwarded to get_dist_between_classes
+
+    Output
+    ------
+        min :    float,          distance between the two classes
+    """
+
+    forward = get_dist_between_classes(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix)
+    backward = get_dist_between_classes(class_2, class_1, chosen_distance, genes_data, genes_labels, inv_cov_matrix)
+
+    # Keep the smaller of the two directed distances
+    if backward < forward:
+        return backward
+
+    return forward
+
+def get_covariance_matrix(array):
+    """
+    Return the covariance matrix of a dataset, computed with numpy
+
+    Parameters
+    ----------
+        array :    array-like,  the input samples, converted to float
+
+    Output
+    ------
+        cov_matrix :    numpy.ndarray,          the covariance matrix of the array
+    """
+
+    # rowvar=False: each column is a variable, each row an observation
+    return np.cov(np.array(array, dtype=float), rowvar=False)
+
+def is_invertible(matrix):
+    """
+    Check whether a matrix is invertible, i.e. square and of full rank
+
+    Parameters
+    ----------
+        matrix :    numpy.ndarray,  the input array (matrix)
+
+    Output
+    ------
+        boolean :    bool,          True if the matrix is invertible, False otherwise
+    """
+    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
+
+    # A non-square matrix is never invertible
+    if n_rows != n_cols:
+        return False
+
+    # A square matrix is invertible exactly when its rank equals its size
+    return np.linalg.matrix_rank(matrix) == n_rows
+
+def invert_conv_matrix(cov_matrix):
+    """
+    Return the inverse of a covariance matrix, falling back to the
+    pseudo-inverse when the matrix is not invertible
+
+    Parameters
+    ----------
+        cov_matrix :    numpy.ndarray,  the input covariance matrix
+
+    Output
+    ------
+        inv_cov_matrix :    numpy.ndarray,          the inverse (or pseudo-inverse) of the covariance
+    """
+    if is_invertible(cov_matrix):
+        return np.linalg.inv(cov_matrix)
+
+    # Singular matrix: use the pseudo-inverse instead
+    return np.linalg.pinv(cov_matrix)
+
+def get_genes_from_class(genes_data, genes_labels, chosen_class):
+    """
+    Select the rows of the dataset whose label is the chosen class
+
+    Parameters
+    ----------
+        genes_data :    numpy.ndarray,  the input dataset
+        genes_labels :  numpy.ndarray,  the labels of the dataset
+        chosen_class :  label value,    the class for which we want to get the genes
+
+    Output
+    ------
+        class_genes :    numpy.ndarray,          the genes of the chosen class
+    """
+    # Labels are flattened before the comparison
+    belongs_to_class = genes_labels.ravel() == chosen_class
+
+    # Positions of the matching samples, used to index the dataset rows
+    return genes_data[np.flatnonzero(belongs_to_class)]
+
+def get_cov_matrix_of_class(genes_data, genes_labels, chosen_class):
+    """
+    Compute the covariance matrix of the samples belonging to the chosen class
+
+    Parameters
+    ----------
+        genes_data :    numpy.ndarray,  the input dataset
+        genes_labels :  numpy.ndarray,  the labels of the dataset
+        chosen_class :  label value,    the class for which we want to get the covariance matrix
+
+    Output
+    ------
+        cov_matrix :    numpy.ndarray,          the covariance matrix of the chosen class
+    """
+    # Rows whose (flattened) label matches the chosen class
+    sample_indices = np.where(genes_labels.ravel() == chosen_class)[0]
+    class_samples = np.array(genes_data[sample_indices], dtype=float)
+
+    # rowvar=False: each column is a variable, each row an observation
+    return np.cov(class_samples, rowvar=False)
+
+def overlap(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix=None):
+    """
+    Compute the overlap between two classes, defined as
+    (intra(class_1) + intra(class_2)) / (2 * inter(class_1, class_2))
+
+    Parameters
+    ----------
+        class_1 :          str,            the first class
+        class_2 :          str,            the second class
+        chosen_distance :  str,            the distance to use
+        genes_data :       numpy.ndarray,  the input dataset
+        genes_labels :     numpy.ndarray,  the labels of the dataset
+        inv_cov_matrix :   numpy.ndarray,  inverse covariance matrix forwarded to the distance helpers
+
+    Output
+    ------
+        overlap :    float,          the overlap between the two classes;
+                                     -1 when a class or the distance is not recognised,
+                                     inf when the inter-class distance is zero
+    """
+    # The helpers return -1 on an unrecognised class or distance. Feeding it
+    # into the formula would give (-1 - 1) / (2 * -1) = 1.0, a plausible but
+    # meaningless overlap, so the sentinel is propagated instead.
+    A = dist_intra_class(chosen_distance, class_1, genes_data, genes_labels, inv_cov_matrix)
+    if A == -1:
+        return -1
+
+    B = dist_intra_class(chosen_distance, class_2, genes_data, genes_labels, inv_cov_matrix)
+    if B == -1:
+        return -1
+
+    C = dist_inter_class(class_1, class_2, chosen_distance, genes_data, genes_labels, inv_cov_matrix)
+    if C == -1:
+        return -1
+
+    # A zero inter-class distance would otherwise divide by zero
+    if C == 0:
+        return float('inf')
+
+    return (A+B)/(2*C)
+
+def get_all_overlaps_for_distance(genes_data, genes_labels, chosen_dist):
+    """
+    Compute the overlap of every pair of classes (each class with itself
+    included) with the chosen distance
+
+    Parameters
+    ----------
+        genes_data :    numpy.ndarray,  the input dataset
+        genes_labels :  numpy.ndarray,  the labels of the dataset
+        chosen_dist :   str,            the distance to use: 'Euclidean', 'Mahalanobis' or 'Cosine'
+
+    Output
+    ------
+        overlaps :    list,          list of tuples (overlap, first class, second class, distance name);
+                                     empty when the distance is not recognised
+    """
+
+    class_names = ["BRCA", "PRAD", "KIRC", "LUAD", "COAD"]
+
+    if chosen_dist == 'Mahalanobis':
+        # One inverse covariance matrix per class, computed up front
+        inverse_by_class = {
+            name: invert_conv_matrix(get_cov_matrix_of_class(genes_data, genes_labels, name))
+            for name in class_names
+        }
+    elif chosen_dist in ('Euclidean', 'Cosine'):
+        # These distances do not use a covariance matrix
+        inverse_by_class = dict.fromkeys(class_names)
+    else:
+        print("Unrecognise distance please try one of the following : 'Euclidean', 'Mahalanobis', 'Cosine'")
+        return []
+
+    results = []
+
+    for position, first in enumerate(class_names):
+        # Start at the current class so that the pair (first, first) is included
+        for second in class_names[position:]:
+            score = overlap(first, second, chosen_dist, genes_data, genes_labels, inverse_by_class[second])
+            results.append((score, first, second, chosen_dist))
+
+    return results
+
+def get_best_features(data, labels, deb, end, pas):
+    """
+    Search the number of features k in range(deb, end, pas) whose SelectKBest
+    selection gives the lowest mean Mahalanobis overlap between classes
+
+    Parameters
+    ----------
+        data :    numpy.ndarray,  the input dataset
+        labels :  numpy.ndarray,  the labels of the dataset
+        deb :     int,            the beginning of the range
+        end :     int,            the end of the range (excluded)
+        pas :     int,            the step
+
+    Output
+    ------
+        best_features :    numpy.ndarray,          the selected features giving the lowest mean overlap
+        best_overlaps :    list,                   list of tuples (overlap, first class, second class, distance name)
+                                                   obtained with best_features
+        best_features.shape[1] : int,              the number of best features
+
+    Raises
+    ------
+        ValueError : if range(deb, end, pas) is empty
+    """
+
+    all_classes = ["BRCA", "PRAD", "KIRC", "LUAD", "COAD"]
+    chosen_dist = "Mahalanobis"
+
+    best_features = None
+    best_overlaps = []
+    best_mean = None
+
+    for k in range(deb, end, pas):
+
+        selector = SelectKBest(score_func=f_classif, k=k)
+        selected_features = selector.fit_transform(data, labels.flatten())
+
+        # One inverse covariance matrix per class for the current selection
+        inv_cov_matrices = [
+            invert_conv_matrix(get_cov_matrix_of_class(selected_features, labels, current_class))
+            for current_class in all_classes
+        ]
+
+        overlaps = []
+        for i, first_class in enumerate(all_classes):
+            for j in range(i+1, len(all_classes)):
+                score = overlap(first_class, all_classes[j], chosen_dist, selected_features, labels, inv_cov_matrices[j])
+                overlaps.append((score, first_class, all_classes[j], chosen_dist))
+
+        mean_overlap = np.mean([entry[0] for entry in overlaps])
+
+        # Update the features only together with their overlaps, so that the
+        # returned features are those that produced best_overlaps and not
+        # simply the last selection tried
+        if best_mean is None or mean_overlap < best_mean:
+            best_mean = mean_overlap
+            best_features = selected_features
+            best_overlaps = overlaps
+
+    if best_features is None:
+        raise ValueError("range(deb, end, pas) is empty, no number of features to evaluate")
+
+    return best_features, best_overlaps, best_features.shape[1]
+
+def print_table_overlaps(results, title):
+    """
+    Draw the overlaps as a class-by-class table, save the figure as
+    '../images_rapport/<title>.png' and display it
+
+    Parameters
+    ----------
+        results :    list,          list of tuples (overlap, first class, second class, distance name)
+        title :      str,           name of the saved image file, without extension
+    """
+
+    class_names = ['BRCA', 'PRAD', 'KIRC', 'LUAD', 'COAD']
+
+    # Rows are the first class of each pair, columns the second one
+    table_df = pd.DataFrame(index=class_names, columns=class_names)
+
+    for entry in results:
+        score, source, target, _metric = entry
+        table_df.loc[source, target] = round(score, 4)
+
+    # Pairs without a result are displayed as empty cells
+    table_df = table_df.fillna("")
+
+    fig, ax = plt.subplots(figsize=(8, 6))
+
+    table = ax.table(
+        cellText=table_df.values,
+        rowLabels=table_df.index,
+        colLabels=table_df.columns,
+        cellLoc='center',
+        loc='center',
+    )
+    table.set_fontsize(12)
+    table.scale(1.5, 1.5)
+
+    # Only the table itself should be visible
+    ax.axis('tight')
+    ax.axis('off')
+
+    output_path = '../images_rapport/'+ title + '.png'
+    plt.savefig(output_path, bbox_inches='tight')
+    plt.show()