s2t2 · s2t2 · Sep 17, 2023 · Sep 17, 2023 · Sep 17, 2023
diff --git a/app/classification/__init__.py b/app/classification/__init__.py
@@ -18,8 +18,9 @@
 Y_COLS_MULTICLASS = [
     # multiclass classification with categorical classes:
     "fourway_label", #"bom_overall_fourway_label", "bom_astroturf_fourway_label"
+    "sixway_fact_label" #, sixway_q_label
 ]
-Y_COLS = Y_COLS_BINARY + Y_COLS_MULTICLASS
+Y_COLS = ["sixway_fact_label"] # NOTE(review): narrowed from `Y_COLS_BINARY + Y_COLS_MULTICLASS` to this single target -- confirm whether this is meant to be temporary
 
 BOT_CLASSES_MAP = {True:"Bot", False:"Human"}
 CLASSES_MAP = {
@@ -28,7 +29,7 @@
     "is_bom_astroturf": BOT_CLASSES_MAP,
     "opinion_community": {0:"Anti-Trump", 1:"Pro-Trump"},
     "is_toxic": {0: "Normal", 1: "Toxic"},
-    "is_factual": {0: "Low Quality", 1: "High Quality"},
+    "is_factual": {0: "Low-Quality", 1: "High-Quality"},
 }
 
 

diff --git a/app/classification/pipeline.py b/app/classification/pipeline.py
@@ -266,11 +266,18 @@ def plot_roc_curve_multiclass(self, fig_show=FIG_SHOW, fig_save=FIG_SAVE, height
 
         chart_data = []
         for i, class_name in enumerate(class_names):
+
             fpr, tpr, _ = roc_curve(y_test_encoded[:,i], self.y_pred_proba[:,i])
             score = auc(fpr, tpr)
+
+            try:
+                color = ORANGES[i+2]
+            except IndexError:
+                color = ORANGES[-1] # just use the same color once we run out of oranges
+
             trace = go.Scatter(x=fpr, y=tpr,
                 mode='lines',
-                line=dict(color=ORANGES[i+2], width=2),
+                line=dict(color=color, width=2),
                 name=f"'{str(class_name).title()}' vs Rest (AUC = {score.round(3)})"
             )
             chart_data.append(trace)

diff --git a/app/colors.py b/app/colors.py
@@ -20,7 +20,7 @@
 BOT_COLORS_MAP = {"Human": GREYS[3], "Bot": PURPLES[6]}
 Q_COLORS_MAP = {"Normal": GREYS[3], "Q-anon": REDS[6]}
 TOXIC_COLORS_MAP = {"Toxic": BROWNS[1], "Normal": GREYS[3]}
-FACT_COLORS_MAP = {"High Quality": GREYS[3], "Low Quality": RD_PU[4]}
+FACT_COLORS_MAP = {"High-Quality": GREYS[3], "Low-Quality": RD_PU[4]}
 
 FOURWAY_COLORS_MAP = {
     "Anti-Trump Human": BLUES[3],
@@ -29,17 +29,6 @@
     "Pro-Trump Human": REDS[3],
     "Pro-Trump Bot": REDS[6],
 }
-SIXWAY_COLORS_MAP = {
-    "Anti-Trump Human": BLUES[3],
-    "Anti-Trump Bot": BLUES[6],
-
-    "Pro-Trump Human": REDS[3],
-    "Pro-Trump Bot": REDS[6],
-
-    "Q-anon Human": REDS[4], # "Pro-Trump Q-anon Human"
-    "Q-anon Bot": REDS[7], # "Pro-Trump Q-anon Bot"
-}
-
 
 COLORS_MAP = {
     "bot_label": BOT_COLORS_MAP,
@@ -49,7 +38,7 @@
     "factual_label": FACT_COLORS_MAP,
 
     "fourway_label": FOURWAY_COLORS_MAP,
-    "sixway_label": SIXWAY_COLORS_MAP,
+
     "bom_overall_label": BOT_COLORS_MAP,
     "bom_astroturf_label": BOT_COLORS_MAP,
 }
@@ -64,8 +53,7 @@
     "q_label": ["Normal", "Q-anon"],
 
     "toxic_label": ["Normal", "Toxic"],
-    "factual_label": ["High Quality", "Low Quality"],
+    "factual_label": ["High-Quality", "Low-Quality"],
 
     "fourway_label": list(FOURWAY_COLORS_MAP.keys()),
-    "sixway_label": list(SIXWAY_COLORS_MAP.keys()),
 }
diff --git a/app/dataset.py b/app/dataset.py
@@ -22,8 +22,9 @@
 
     'opinion_label', 'bot_label', 'q_label',
     "toxic_label", "factual_label",
-    'bom_overall_label', 'bom_astroturf_label', #'group_label'
-    'fourway_label', 'sixway_label', "bom_overall_fourway_label", "bom_astroturf_fourway_label"
+    'bom_overall_label', 'bom_astroturf_label',
+    'fourway_label', 'sixway_q_label', "sixway_fact_label",
+    "bom_overall_fourway_label", "bom_astroturf_fourway_label"
 ]
 
 
@@ -37,18 +38,13 @@ def __init__(self, csv_filepath=CSV_FILEPATH, label_cols=LABEL_COLS):
     def df(self):
         df = read_csv(self.csv_filepath)
 
-        df.rename(columns={"group_label": "sixway_label"}, inplace=True)
-        #print(df["sixway_label"].value_counts())
+        df.rename(columns={"group_label": "sixway_q_label"}, inplace=True)
 
-        df["fourway_label"] = df["opinion_label"] + " " + df["bot_label"]
-        #print(df["fourway_label"].value_counts())
 
         df["is_bom_overall"] = df["bom_overall"].round()
         df["is_bom_astroturf"] = df["bom_astroturf"].round()
         df["bom_overall_label"] = df["is_bom_overall"].map({1:"Bot", 0:"Human"})
         df["bom_astroturf_label"] = df["is_bom_astroturf"].map({1:"Bot", 0:"Human"})
-        df["bom_overall_fourway_label"] = df["opinion_label"] + " " + df["bom_overall_label"]
-        df["bom_astroturf_fourway_label"] = df["opinion_label"] + " " + df["bom_astroturf_label"]
 
         toxic_threshold = 0.1 # set threshold and check robustness
         df["is_toxic"] = df["avg_toxicity"] >= toxic_threshold
@@ -59,7 +55,15 @@ def df(self):
         fact_threshold = 3.0 # set threshold and check robustness
         df["is_factual"] = df["avg_fact_score"].apply(lambda score: score if isnull(score) else score >= fact_threshold)
         df["is_factual"] = df["is_factual"].map({True: 1, False :0 })
-        df["factual_label"] = df["is_factual"].map({1: "High Quality", 0 :"Low Quality" })
+        df["factual_label"] = df["is_factual"].map({1: "High-Quality", 0 :"Low-Quality" })
+
+        # COMBINATIONS
+
+        df["fourway_label"] = df["opinion_label"] + " " + df["bot_label"]
+        df["bom_overall_fourway_label"] = df["opinion_label"] + " " + df["bom_overall_label"]
+        df["bom_astroturf_fourway_label"] = df["opinion_label"] + " " + df["bom_astroturf_label"]
+
+        df["sixway_fact_label"] = df["opinion_label"] + " " + df["bot_label"] + " " + df["factual_label"]
 
         return df
 

diff --git a/app/reduction/pipeline.py b/app/reduction/pipeline.py
@@ -224,7 +224,7 @@ def plot_centroids(self, groupby_col, height=500, fig_show=FIG_SHOW, fig_save=FI
         "bot_label", "opinion_label", "bom_overall_label", "bom_astroturf_label",
         "toxic_label", "factual_label",
 
-        "fourway_label", #"sixway_label",
+        "fourway_label", #"sixway_fact_label",
                         ]:
         color_map = COLORS_MAP[groupby_col]
         category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}

diff --git a/conftest.py b/conftest.py
@@ -8,7 +8,7 @@
 
 N_USERS = 7566
 N_FEATURES = 1536 # number of embeddings returned by openai
-N_LABELS = 36 # number of label columns
+N_LABELS = 37 # number of label columns
 
 @fixture(scope="module")
 def ds():

diff --git a/index.html b/index.html
@@ -52,7 +52,6 @@ <h3><a href="results/reduced_classification/index.html">Classification Results (
             "opinion_label": "Opinion Community",
             "q_label": "Qanon Status",
             "fourway_label": "Four Group Label",
-            "sixway_label": "Six Group Label"
             }
         //var table = document.getElementById("results-table")
         //var tableBody = table.tBodies[0]
@@ -102,7 +101,7 @@ <h3><a href="results/reduced_classification/index.html">Classification Results (
         var GROUPS = ["bot_label",
             "bom_overall_label", "bom_astroturf_label",
             "opinion_label",
-            "fourway_label", //"sixway_label"
+            "fourway_label",
         ]
 
         REDUCTION_METHODS.forEach(function(reduction_method){

diff --git a/...uction/sixway_label/tsne_2_centroids.html → ..._label/logistic_regression/confusion.html b/...uction/sixway_label/tsne_2_centroids.html → ..._label/logistic_regression/confusion.html
diff --git a/results/classification/sixway_fact_label/logistic_regression/confusion.png b/results/classification/sixway_fact_label/logistic_regression/confusion.png
diff --git a/results/classification/sixway_fact_label/logistic_regression/results.json b/results/classification/sixway_fact_label/logistic_regression/results.json
@@ -0,0 +1,185 @@
+{
+    "class_names": [
+        "0",
+        "1",
+        "2",
+        "3",
+        "4",
+        "5",
+        "6",
+        "7"
+    ],
+    "class_labels": [
+        "Anti-Trump Bot High-Quality",
+        "Anti-Trump Bot Low-Quality",
+        "Anti-Trump Human High-Quality",
+        "Anti-Trump Human Low-Quality",
+        "Pro-Trump Bot High-Quality",
+        "Pro-Trump Bot Low-Quality",
+        "Pro-Trump Human High-Quality",
+        "Pro-Trump Human Low-Quality"
+    ],
+    "classification_report": {
+        "Anti-Trump Bot High-Quality": {
+            "precision": 0.6683804627249358,
+            "recall": 0.9961685823754789,
+            "f1-score": 0.8,
+            "support": 261.0
+        },
+        "Anti-Trump Bot Low-Quality": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 96.0
+        },
+        "Anti-Trump Human High-Quality": {
+            "precision": 0.8333333333333334,
+            "recall": 0.1724137931034483,
+            "f1-score": 0.28571428571428575,
+            "support": 29.0
+        },
+        "Anti-Trump Human Low-Quality": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 9.0
+        },
+        "Pro-Trump Bot High-Quality": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 10.0
+        },
+        "Pro-Trump Bot Low-Quality": {
+            "precision": 0.8939393939393939,
+            "recall": 0.9915966386554622,
+            "f1-score": 0.9402390438247011,
+            "support": 238.0
+        },
+        "Pro-Trump Human High-Quality": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 2.0
+        },
+        "Pro-Trump Human Low-Quality": {
+            "precision": 0.0,
+            "recall": 0.0,
+            "f1-score": 0.0,
+            "support": 14.0
+        },
+        "accuracy": 0.7602427921092565,
+        "macro avg": {
+            "precision": 0.2994566487497079,
+            "recall": 0.2700223767667987,
+            "f1-score": 0.2532441661923734,
+            "support": 659.0
+        },
+        "weighted avg": {
+            "precision": 0.6242360291281497,
+            "recall": 0.7602427921092565,
+            "f1-score": 0.6689872636054525,
+            "support": 659.0
+        }
+    },
+    "confusion_matrix": [
+        [
+            260,
+            0,
+            0,
+            0,
+            0,
+            1,
+            0,
+            0
+        ],
+        [
+            94,
+            0,
+            0,
+            0,
+            0,
+            2,
+            0,
+            0
+        ],
+        [
+            24,
+            0,
+            5,
+            0,
+            0,
+            0,
+            0,
+            0
+        ],
+        [
+            8,
+            0,
+            1,
+            0,
+            0,
+            0,
+            0,
+            0
+        ],
+        [
+            1,
+            0,
+            0,
+            0,
+            0,
+            9,
+            0,
+            0
+        ],
+        [
+            2,
+            0,
+            0,
+            0,
+            0,
+            236,
+            0,
+            0
+        ],
+        [
+            0,
+            0,
+            0,
+            0,
+            0,
+            2,
+            0,
+            0
+        ],
+        [
+            0,
+            0,
+            0,
+            0,
+            0,
+            14,
+            0,
+            0
+        ]
+    ],
+    "roc_auc_score": 0.8332511940989005,
+    "y_col": "sixway_fact_label",
+    "x_scaled": false,
+    "grid_search": {
+        "model_type": "LogisticRegression",
+        "k_folds": 5,
+        "param_grid": {
+            "classifier__max_iter": [
+                25,
+                1000,
+                10000
+            ]
+        },
+        "best_params": {
+            "classifier__max_iter": 1000
+        },
+        "best_score": 0.8309722151668659
+    }
+}
diff --git a/...uction/sixway_label/tsne_3_centroids.html → ..._label/logistic_regression/roc_curve.html b/...uction/sixway_label/tsne_3_centroids.html → ..._label/logistic_regression/roc_curve.html
diff --git a/results/classification/sixway_fact_label/logistic_regression/roc_curve.png b/results/classification/sixway_fact_label/logistic_regression/roc_curve.png
diff --git a/results/reduction/sixway_label/pca_2.html b/results/reduction/sixway_label/pca_2.html
diff --git a/results/reduction/sixway_label/pca_2_centroids.html b/results/reduction/sixway_label/pca_2_centroids.html
diff --git a/results/reduction/sixway_label/pca_3.html b/results/reduction/sixway_label/pca_3.html
diff --git a/results/reduction/sixway_label/pca_3_centroids.html b/results/reduction/sixway_label/pca_3_centroids.html
diff --git a/results/reduction/sixway_label/tsne_2.html b/results/reduction/sixway_label/tsne_2.html
diff --git a/results/reduction/sixway_label/tsne_3.html b/results/reduction/sixway_label/tsne_3.html
diff --git a/results/reduction/sixway_label/umap_2.html b/results/reduction/sixway_label/umap_2.html
diff --git a/results/reduction/sixway_label/umap_2_centroids.html b/results/reduction/sixway_label/umap_2_centroids.html
diff --git a/results/reduction/sixway_label/umap_3.html b/results/reduction/sixway_label/umap_3.html
diff --git a/results/reduction/sixway_label/umap_3_centroids.html b/results/reduction/sixway_label/umap_3_centroids.html