I finally found out why my Sentiment Classification BERT Model kept overfitting to the dataset. The Transformer block I made wasn't capable of handling the padding tokens, and kept learning information from them. To address the problem, I decided to temporarily use Keras' built-in MultiHeadAttention layer.
This is the BERT module. I plan to extract the encoder layers for transfer learning.
def bert_module(query, key, value, i, mask=None, dropout_rate=0.1):
    """One post-LayerNorm Transformer encoder block.

    Args:
        query, key, value: Input tensors for multi-head attention; for
            self-attention all three are the same tensor.
        i: 1-based layer index, used only to build unique layer names.
        mask: Optional attention mask forwarded to MultiHeadAttention so
            padding positions are ignored. Defaults to None (no masking).
        dropout_rate: Dropout probability for both sub-layers (previously
            hard-coded to 0.1; the default preserves old behavior).

    Returns:
        The encoder block's output tensor (same shape as `query`).
    """
    # Self-attention sub-layer. Passing `mask` is essential: without it the
    # model attends to padding tokens — the overfitting problem described
    # in the write-up above.
    attention_output = MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name="encoder_{}_multiheadattention".format(i),
    )(query, key, value, attention_mask=mask)
    attention_output = Dropout(dropout_rate, name="encoder_{}_att_dropout".format(i))(
        attention_output
    )
    # Residual connection + LayerNorm (post-LN arrangement).
    attention_output = LayerNormalization(
        epsilon=1e-6, name="encoder_{}_att_layernormalization".format(i)
    )(query + attention_output)
    # Position-wise feed-forward network.
    ffn = Sequential(
        [
            Dense(config.FF_DIM, activation="relu"),
            Dense(config.EMBED_DIM),
        ],
        name="encoder_{}_ffn".format(i),
    )
    ffn_output = ffn(attention_output)
    ffn_output = Dropout(dropout_rate, name="encoder_{}_ffn_dropout".format(i))(
        ffn_output
    )
    # Second residual connection + LayerNorm.
    sequence_output = LayerNormalization(
        epsilon=1e-6, name="encoder_{}_ffn_layernormalization".format(i)
    )(attention_output + ffn_output)
    return sequence_output
This is my full Sentiment Analysis BERT Model.
def bert_module(query, key, value, i, mask=None):
    """Single Transformer encoder block: self-attention + FFN, post-LN style."""
    # Multi-head self-attention; `mask` (when given) keeps attention off padding.
    attn = MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name=f"encoder_{i}_multiheadattention",
    )(query, key, value, attention_mask=mask)
    attn = Dropout(0.1, name=f"encoder_{i}_att_dropout")(attn)
    # Residual connection + LayerNorm around the attention sub-layer.
    attn = LayerNormalization(
        epsilon=1e-6, name=f"encoder_{i}_att_layernormalization"
    )(query + attn)
    # Two-layer position-wise feed-forward network.
    feed_forward = Sequential(
        [Dense(config.FF_DIM, activation="relu"), Dense(config.EMBED_DIM)],
        name=f"encoder_{i}_ffn",
    )
    ffn_out = feed_forward(attn)
    ffn_out = Dropout(0.1, name=f"encoder_{i}_ffn_dropout")(ffn_out)
    # Residual connection + LayerNorm around the FFN sub-layer.
    return LayerNormalization(
        epsilon=1e-6, name=f"encoder_{i}_ffn_layernormalization"
    )(attn + ffn_out)
# Unreduced (per-sample) loss so `sample_weight` can zero out the positions
# that were not masked; the reduction is applied manually in compute_loss.
loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction=None)
# Running mean of the loss, surfaced through the model's `metrics` property.
loss_tracker = keras.metrics.Mean(name="loss")
class MaskedLanguageModel(keras.Model):
    """Keras model with custom loss/metric plumbing for MLM pre-training.

    Relies on the module-level `loss_fn` and `loss_tracker`. `sample_weight`
    is presumably the MLM mask (nonzero at masked positions) — TODO confirm
    against the data pipeline that produces it.
    """

    def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None):
        # Per-sample losses weighted by the mask, then summed to a scalar.
        loss = loss_fn(y, y_pred, sample_weight)
        loss_tracker.update_state(loss, sample_weight=sample_weight)
        return keras.ops.sum(loss)

    def compute_metrics(self, x, y, y_pred, sample_weight):
        # Report only the tracked mean loss.
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # Listing the tracker here lets Keras reset it between epochs.
        return [loss_tracker]
def create_masked_language_bert_model():
    """Build and compile the BERT-style masked-language model for pre-training.

    Returns:
        A compiled `MaskedLanguageModel` mapping (MAX_LEN,) token-id inputs
        to per-position softmax distributions over the vocabulary.
    """
    inputs = Input(shape=(config.MAX_LEN,), name="Input")
    word_embeddings = Embedding(
        config.VOCAB_SIZE,
        config.EMBED_DIM,
        mask_zero=True,
        name="word_embedding"
    )(inputs)
    position_embeddings = keras_hub.layers.PositionEmbedding(
        sequence_length=config.MAX_LEN,
        name="position_embedding"
    )(word_embeddings)
    embeddings = word_embeddings + position_embeddings

    # Build the padding mask explicitly from the raw token ids (id 0 = pad)
    # and hand it to every encoder layer. Previously bert_module was called
    # with mask=None, so it depended entirely on implicit Keras mask
    # propagation surviving the embedding addition above — making it
    # explicit guarantees attention never reads padding positions.
    padding_mask = keras.ops.not_equal(inputs, 0)
    attention_mask = keras.ops.logical_and(
        padding_mask[:, :, None], padding_mask[:, None, :]
    )  # (batch, MAX_LEN, MAX_LEN) boolean mask for MultiHeadAttention

    encoder_output = embeddings
    for i in range(1, 4 + 1):  # 4 stacked encoder layers, named encoder_1..4
        encoder_output = bert_module(
            encoder_output, encoder_output, encoder_output, i, mask=attention_mask
        )

    # Per-position classifier over the vocabulary (the MLM head).
    mlm_output = Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")(
        encoder_output
    )
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)
    return mlm_model
# Map token ids <-> vocabulary strings for decoding model predictions.
id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}
# NOTE(review): assumes the vectorizer lowercases text so "[mask]" matches
# the mask token stored in the vocabulary — verify vectorize_layer's config.
sample_tokens = vectorize_layer(["Google wont [mask] replacing our news headlines with terrible AI"])
bert_masked_model = create_masked_language_bert_model()
After pre-training the model, I extracted layers from input to the last encoder layer and added dense layers to create a sentiment classification model.
# Reuse everything from the input through the last (4th) encoder layer as a
# feature extractor for transfer learning; freeze it for head-only training.
pretrained_bert_model = keras.Model(
    bert_masked_model.input, bert_masked_model.get_layer("encoder_4_ffn_layernormalization").output
)
pretrained_bert_model.trainable = False
def create_classifier_bert_model():
    """Build the 3-class sentiment classifier on top of the frozen encoder.

    Returns:
        A compiled Keras model mapping (MAX_LEN,) int64 token-id inputs to
        softmax probabilities over the 3 sentiment classes.
    """
    inputs = layers.Input((config.MAX_LEN,), dtype="int64")
    # Frozen BERT encoder: (batch, MAX_LEN, EMBED_DIM) contextual features.
    sequence_output = pretrained_bert_model(inputs)
    # CLS-style pooling: the first token's vector summarizes the sentence.
    pooled_output = layers.Lambda(lambda x: x[:, 0, :])(sequence_output)
    x = layers.BatchNormalization()(pooled_output)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    outputs = layers.Dense(3, activation="softmax")(x)
    classifier_model = Model(inputs, outputs, name="classification")
    # Use the same `keras` namespace as the rest of the file: the code above
    # uses keras.ops (Keras 3), and mixing tf.keras optimizers with Keras 3
    # models is an inconsistency that can break depending on versions.
    optimizer = keras.optimizers.Adam(learning_rate=2e-5)
    classifier_model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return classifier_model
# NOTE(review): "classifer_model" is a typo, but it is referenced
# consistently below, so the name is kept for compatibility.
classifer_model = create_classifier_bert_model()
Last time, the dataset was imbalanced, so I undersampled it. However, that resulted in a dataset too small, so this time, I used nlpaug to oversample my dataset.
# WordNet-based synonym-replacement augmenter from nlpaug.
aug = naw.SynonymAug(aug_src='wordnet')
def augment_text(df, target_count):
    """Oversample `df` up to `target_count` rows via synonym augmentation.

    Args:
        df: DataFrame with 'Input' (text) and 'Output' (label) columns; all
            rows are assumed to share the same label.
        target_count: Desired total number of rows after augmentation.

    Returns:
        `df` unchanged if it already has at least `target_count` rows,
        otherwise the original rows concatenated with augmented copies.

    Raises:
        ValueError: If `df` is empty but augmentation is required. (The old
            code would spin forever: the inner for-loop iterated zero times,
            so `len(aug_samples)` could never reach `needed`.)
    """
    current_count = len(df)
    if current_count >= target_count:
        return df
    if current_count == 0:
        # Guard against the infinite while-loop described above.
        raise ValueError("Cannot augment an empty DataFrame.")
    aug_samples = []
    needed = target_count - current_count
    label_name = df.iloc[0]['Output']
    print(f"Augmenting '{label_name}' class: {current_count} -> {target_count}...")
    # Sweep the source texts repeatedly until enough new samples exist;
    # SynonymAug is stochastic, so re-augmenting the same text still varies.
    while len(aug_samples) < needed:
        for text in df['Input']:
            if len(aug_samples) >= needed:
                break
            augmented_text = aug.augment(text)[0]
            aug_samples.append(augmented_text)
    df_aug = pd.DataFrame({'Input': aug_samples, 'Output': [label_name] * len(aug_samples)})
    return pd.concat([df, df_aug])
# Oversample every class to the same size, then shuffle the combined frame.
target_n = 2000
df_neg_final = augment_text(df_neg_down, target_n)
df_neu_final = augment_text(df_neu_down, target_n)
df_pos_final = augment_text(df_pos_down, target_n)
final_df = pd.concat([df_neg_final, df_neu_final, df_pos_final]).sample(frac=1, random_state=42).reset_index(drop=True)

# Vectorize texts and integer-encode the three sentiment labels.
X_encoded = encode(final_df['Input'])
le = LabelEncoder()
y_encoded = le.fit_transform(final_df['Output'])

# Stratify on the labels so train and test keep the same class balance;
# an unstratified 10% split can over/under-represent a class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.1, random_state=42, stratify=y_encoded
)

train_classifier_ds = (tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train)).batch(32))
test_classifier_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)
I trained the model with the pre-trained layers frozen, then fine-tuned the model again.
# Phase 1: train only the new classification head (encoder frozen).
pretrained_bert_model.trainable = False
classifer_model.fit(
    train_classifier_ds,
    epochs=5,
    validation_data=test_classifier_ds,
)

# Phase 2: unfreeze the encoder and fine-tune end-to-end.
pretrained_bert_model.trainable = True
# Fine-tune with a small learning rate. The previous code used Adam's
# default (1e-3), which is large enough to destroy the pre-trained encoder
# weights; 1e-5 is a conventional BERT fine-tuning rate.
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
classifer_model.compile(
    optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
classifer_model.fit(
    train_classifier_ds,
    epochs=5,
    validation_data=test_classifier_ds,
)
This solved the problem of the model generating the same output regardless of the input, but still isn’t really accurate.
I tested the model with the following headlines.
# Hand-written evaluation headlines grouped by expected sentiment; the
# "Complicated" group probes negation, mixed sentiment, and entity focus.
test_headlines = [
    # Negative
    "Apple shares plummet after disappointing iPhone sales report",
    "Lawsuit filed against Apple over battery throttling issues",
    "Supply chain disruptions expected to delay new Mac releases",
    # Positive
    "Apple's quarterly profit exceeds analyst expectations",
    "Revolutionary AI features announced for upcoming iOS update",
    "Warren Buffett increases stake in Apple, citing strong ecosystem",
    # Neutral
    "Apple to hold its annual developer conference in June",
    "New software update available for Apple Watch users",
    "Tim Cook delivers keynote speech at university graduation",
    # Complicated
    "Apple is down today, but long-term outlook remains bright",
    "The new iPhone is not as bad as critics predicted",
    "Apple's competitor Samsung sees record-breaking growth",
    "Is Apple losing its innovative edge?",
]
This was the output:
[1879 637 8661 12 4977 7374 695 52 0 0 0 0 0 0
0 0]
Input Headline: "Apple shares plummet after disappointing iPhone sales report"
Top Prediction: neutral (46.39%)
------------------------------
Class Probabilities:
- neutral : 46.39% #########
- negative : 30.54% ######
- positive : 23.07% ####
==================================================
[ 3762 15027 45 1879 8 5033 1 631 0 0 0 0
0 0 0 0]
Input Headline: "Lawsuit filed against Apple over battery throttling issues"
Top Prediction: neutral (57.48%)
------------------------------
Class Probabilities:
- neutral : 57.48% ###########
- positive : 21.28% ####
- negative : 21.24% ####
==================================================
[1071 5216 8391 419 2 731 13 8233 1493 0 0 0 0 0
0 0]
Input Headline: "Supply chain disruptions expected to delay new Mac releases"
Top Prediction: neutral (57.53%)
------------------------------
Class Probabilities:
- neutral : 57.53% ###########
- positive : 21.26% ####
- negative : 21.21% ####
==================================================
[ 6299 9813 647 10747 2601 4761 0 0 0 0 0 0
0 0 0 0]
Input Headline: "Apple's quarterly profit exceeds analyst expectations"
Top Prediction: positive (57.56%)
------------------------------
Class Probabilities:
- positive : 57.56% ###########
- neutral : 21.23% ####
- negative : 21.21% ####
==================================================
[12336 6231 8969 1461 4 11102 1 1501 0 0 0 0
0 0 0 0]
Input Headline: "Revolutionary AI features announced for upcoming iOS update"
Top Prediction: positive (39.86%)
------------------------------
Class Probabilities:
- positive : 39.86% #######
- neutral : 35.40% #######
- negative : 24.75% ####
==================================================
[ 3387 20988 2317 3766 3 1879 14635 559 16327 0 0 0
0 0 0 0]
Input Headline: "Warren Buffett increases stake in Apple, citing strong ecosystem"
Top Prediction: positive (57.14%)
------------------------------
Class Probabilities:
- positive : 57.14% ###########
- neutral : 21.56% ####
- negative : 21.30% ####
==================================================
[1879 2 337 435 3106 2488 817 3 1109 0 0 0 0 0
0 0]
Input Headline: "Apple to hold its annual developer conference in June"
Top Prediction: neutral (53.63%)
------------------------------
Class Probabilities:
- neutral : 53.63% ##########
- positive : 24.41% ####
- negative : 21.96% ####
==================================================
[ 13 6621 1501 3838 4 1879 923 2940 0 0 0 0 0 0
0 0]
Input Headline: "New software update available for Apple Watch users"
Top Prediction: neutral (57.52%)
------------------------------
Class Probabilities:
- neutral : 57.52% ###########
- positive : 21.26% ####
- negative : 21.22% ####
==================================================
[ 1374 2319 1671 1 1556 10 1049 11390 0 0 0 0
0 0 0 0]
Input Headline: "Tim Cook delivers keynote speech at university graduation"
Top Prediction: negative (49.19%)
------------------------------
Class Probabilities:
- negative : 49.19% #########
- neutral : 28.00% #####
- positive : 22.80% ####
==================================================
[1879 50 49 2134 300 1 1711 641 4203 0 0 0 0 0
0 0]
Input Headline: "Apple is down today, but long-term outlook remains bright"
Top Prediction: negative (57.50%)
------------------------------
Class Probabilities:
- negative : 57.50% ###########
- neutral : 21.27% ####
- positive : 21.23% ####
==================================================
[ 7 13 7374 50 31 17 1117 17 3195 2129 0 0 0 0
0 0]
Input Headline: "The new iPhone is not as bad as critics predicted"
Top Prediction: neutral (55.20%)
------------------------------
Class Probabilities:
- neutral : 55.20% ###########
- negative : 23.09% ####
- positive : 21.71% ####
==================================================
[ 6299 13035 8602 904 1 642 0 0 0 0 0 0
0 0 0 0]
Input Headline: "Apple's competitor Samsung sees record-breaking growth"
Top Prediction: positive (57.04%)
------------------------------
Class Probabilities:
- positive : 57.04% ###########
- negative : 21.62% ####
- neutral : 21.35% ####
==================================================
[ 50 1879 1920 435 14010 1553 0 0 0 0 0 0
0 0 0 0]
Input Headline: "Is Apple losing its innovative edge?"
Top Prediction: positive (57.36%)
------------------------------
Class Probabilities:
- positive : 57.36% ###########
- neutral : 21.34% ####
- negative : 21.30% ####
==================================================
It was clear that the model was making decisions based on strong words like "Good", "Bad", or "Innovative" (e.g. the model predicted positive for "Is Apple losing its innovative edge?").
Then I used GoogleNews API to evaluate the model’s performance on actual news headlines.
# Fetch the last 7 days of "Apple" headlines and tally the model's
# predicted sentiment for each one.
gn = GoogleNews(lang = "en", country = "US")
google_news = gn.search('Apple', when='7d')

positive_count = 0
negative_count = 0
neutral_count = 0
for entry in google_news['entries']:
    # NOTE(review): assumes predict_and_print returns a string containing
    # the predicted label, or None on failure — confirm its contract.
    result = predict_and_print(entry['title'], classifer_model)
    if result is not None:
        if ("positive" in result):
            positive_count += 1
        elif ("negative" in result):
            negative_count += 1
        elif ("neutral" in result):
            neutral_count += 1

print("\n" + "="*50 + "\n")
print(f"Positive Count: {positive_count}")
print(f"Negative Count: {negative_count}")
print(f"Neutral Count: {neutral_count}")
Positive Count: 14
Negative Count: 10
Neutral Count: 76
Input Headline: "Apple introduces iPhone 17e - Apple"
Top Prediction: positive (50.14%)
------------------------------
Class Probabilities:
- positive : 50.14% ##########
- neutral : 27.35% #####
- negative : 22.51% ####
==================================================
Input Headline: "I’m most excited about Apple’s affordable MacBook, with one concern - 9to5Mac"
Top Prediction: negative (47.02%)
------------------------------
Class Probabilities:
- negative : 47.02% #########
- positive : 29.35% #####
- neutral : 23.63% ####
==================================================
Input Headline: "Apple Is in Its Affordable Era. Sort Of. - The New York Times"
Top Prediction: neutral (57.55%)
------------------------------
Class Probabilities:
- neutral : 57.55% ###########
- positive : 21.24% ####
- negative : 21.21% ####
==================================================
Input Headline: "MacBook Neo hands-on: Apple build quality at a substantially lower price - Ars Technica"
Top Prediction: negative (57.47%)
------------------------------
Class Probabilities:
- negative : 57.47% ###########
- positive : 21.29% ####
- neutral : 21.24% ####
==================================================
Input Headline: "Apple Just Launched an Entire Lineup of AI-Powered Macs, iPhones, and iPads - eWeek"
Top Prediction: neutral (57.52%)
------------------------------
Class Probabilities:
- neutral : 57.52% ###########
- positive : 21.26% ####
- negative : 21.22% ####
==================================================
Input Headline: "Apple launches lower cost iPhone 17e and a new iPad Air powered by its M4 chip - CNBC"
Top Prediction: negative (47.34%)
------------------------------
Class Probabilities:
- negative : 47.34% #########
- positive : 28.93% #####
- neutral : 23.73% ####
==================================================
Input Headline: "Apple debuts $599 iPhone 17e, more powerful iPad Airs - Yahoo Finance"
Top Prediction: neutral (57.47%)
------------------------------
Class Probabilities:
- neutral : 57.47% ###########
- positive : 21.30% ####
- negative : 21.23% ####
==================================================
Input Headline: "A new cheaper MacBook just dropped — here’s everything you need to know - NBC News"
Top Prediction: neutral (55.68%)
------------------------------
Class Probabilities:
- neutral : 55.68% ###########
- negative : 22.68% ####
- positive : 21.63% ####
==================================================
Input Headline: "Apple Just Released a Slew of Products Under $1,000 - Esquire"
Top Prediction: neutral (57.54%)
------------------------------
Class Probabilities:
- neutral : 57.54% ###########
- positive : 21.24% ####
- negative : 21.21% ####
==================================================
Input Headline: "Apple’s New MacBook Air and MacBook Pro Have New Chips, More Storage, and Higher Prices - WIRED"
Top Prediction: neutral (56.79%)
------------------------------
Class Probabilities:
- neutral : 56.79% ###########
- positive : 21.84% ####
- negative : 21.37% ####
==================================================
Input Headline: "Apple announces its cheapest-ever new MacBook - CNN"
Top Prediction: neutral (56.87%)
------------------------------
Class Probabilities:
- neutral : 56.87% ###########
- positive : 21.78% ####
- negative : 21.35% ####
==================================================
Input Headline: "Apple’s Touch MacBook Will Stop Well Short of a Mac-iPad Hybrid - Bloomberg"
Top Prediction: neutral (56.71%)
------------------------------
Class Probabilities:
- neutral : 56.71% ###########
- positive : 21.80% ####
- negative : 21.49% ####
==================================================
Input Headline: "Apple’s MacBook Neo is $599? Whoa. - Macworld"
Top Prediction: neutral (54.91%)
------------------------------
Class Probabilities:
- neutral : 54.91% ##########
- positive : 23.25% ####
- negative : 21.84% ####
==================================================
Input Headline: "Apple launches iPhone 17e at $599, boosts base storage to 256 gigabytes - Reuters"
Top Prediction: neutral (57.46%)
------------------------------
Class Probabilities:
- neutral : 57.46% ###########
- positive : 21.31% ####
- negative : 21.23% ####
==================================================
Input Headline: "Whoops! Apple accidentally reveals ‘MacBook Neo’ - Macworld"
Top Prediction: neutral (40.15%)
------------------------------
Class Probabilities:
- neutral : 40.15% ########
- positive : 35.83% #######
- negative : 24.02% ####
==================================================
Input Headline: "Apple unveils lower cost iPhone 17e, raises prices on MacBooks - Fox Business"
Top Prediction: negative (57.53%)
------------------------------
Class Probabilities:
- negative : 57.53% ###########
- positive : 21.25% ####
- neutral : 21.22% ####
==================================================
Input Headline: "Formula 1 is live on Apple TV this weekend, here are the details - 9to5Mac"
Top Prediction: neutral (57.55%)
------------------------------
Class Probabilities:
- neutral : 57.55% ###########
- positive : 21.24% ####
- negative : 21.21% ####
==================================================
Input Headline: "The MacBook Neo May Be Apple’s Most Consequential New Product in a Decade - inc.com"
Top Prediction: neutral (57.28%)
------------------------------
Class Probabilities:
- neutral : 57.28% ###########
- positive : 21.45% ####
- negative : 21.27% ####
==================================================
Input Headline: "Eva’s Apple #17: A dominant outlook - Flat Hat News"
Top Prediction: negative (57.54%)
------------------------------
Class Probabilities:
- negative : 57.54% ###########
- positive : 21.24% ####
- neutral : 21.23% ####
==================================================
Input Headline: "Obituary | Jeanan Helen Graham of Apple Valley, Minnesota - White Funeral Homes"
Top Prediction: neutral (57.50%)
------------------------------
Class Probabilities:
- neutral : 57.50% ###########
- positive : 21.28% ####
- negative : 21.22% ####
==================================================
Input Headline: "Apple Music Coming to Chocolate Bars - MacRumors"
Top Prediction: neutral (50.83%)
------------------------------
Class Probabilities:
- neutral : 50.83% ##########
- positive : 26.74% #####
- negative : 22.43% ####
==================================================
Input Headline: "Apple Launches $599 MacBook Neo, Threatening Windows PC Market - Bloomberg"
Top Prediction: neutral (56.85%)
------------------------------
Class Probabilities:
- neutral : 56.85% ###########
- positive : 21.74% ####
- negative : 21.42% ####
==================================================
Input Headline: "Apple’s new products add C1X chip for three unique advantages - 9to5Mac"
Top Prediction: neutral (56.73%)
------------------------------
Class Probabilities:
- neutral : 56.73% ###########
- positive : 21.89% ####
- negative : 21.38% ####
==================================================
Input Headline: "Apple introduces the $599 iPhone 17e with MagSafe and twice the storage - Engadget"
Top Prediction: neutral (57.48%)
------------------------------
Class Probabilities:
- neutral : 57.48% ###########
- positive : 21.30% ####
- negative : 21.23% ####
==================================================
Input Headline: "Apple announces the iPhone 17E - The Verge"
Top Prediction: neutral (57.12%)
------------------------------
Class Probabilities:
- neutral : 57.12% ###########
- positive : 21.57% ####
- negative : 21.30% ####
==================================================
Input Headline: "Apple launches new generation of Studio Display - 9to5Mac"
Top Prediction: neutral (57.54%)
------------------------------
Class Probabilities:
- neutral : 57.54% ###########
- positive : 21.25% ####
- negative : 21.21% ####
==================================================
Input Headline: "Meet the MacBook Neo, Apple’s colorful answer to the Chromebook, starting at $599 - TechCrunch"
Top Prediction: neutral (47.87%)
------------------------------
Class Probabilities:
- neutral : 47.87% #########
- positive : 28.94% #####
- negative : 23.19% ####
==================================================
This time, the model predicted 'neutral' most of the time (76/100), followed by positive (14/100), then negative (10/100).