Data Augmentation techniques suitable for text datasets
Besides back-translation, there are several data augmentation techniques you can try on text data. Here are some examples:
- Synonym Replacement: Replace words in the text with their synonyms. You can use the WordNet corpus in NLTK to find synonyms (see the short lookup sketch after this list).
- Random Insertion: Insert random words into the text at random positions.
- Random Deletion: Delete random words from the text.
- Random Swap: Swap randomly chosen pairs of words in the text.
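For a quick look at what WordNet actually returns, here is a minimal lookup sketch (the word 'happy' is just an illustrative choice). Each synset groups lemmas that share one sense of the word:

from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')

# Print the lemma names in each synset of the word
for syn in wordnet.synsets('happy'):
    print(syn.name(), [lemma.name() for lemma in syn.lemmas()])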
Here’s example code that implements these data augmentation techniques:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet, stopwords

nltk.download('wordnet')
nltk.download('stopwords')

# Stop words are skipped during synonym replacement
stop_words = set(stopwords.words('english'))
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('sample.csv')
# Select the column containing the text data
text_column = 'text'
# Create a list of the text data in the selected column
text_data = df[text_column].tolist()
# Define data augmentation functions
def synonym_replacement(text, n=1):
    """Replace up to n non-stop-words with a randomly chosen synonym."""
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set(word for word in words if word.lower() not in stop_words))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    new_text = ' '.join(new_words)
    return new_text
def random_insertion(text, n=1):
    """Insert n words drawn from the text itself at random positions."""
    words = text.split()
    if not words:
        return text
    new_words = words.copy()
    for _ in range(n):
        random_word = random.choice(words)
        new_words.insert(random.randint(0, len(new_words)), random_word)
    new_text = ' '.join(new_words)
    return new_text
def random_deletion(text, p=0.2):
    """Delete each word independently with probability p."""
    words = text.split()
    new_words = []
    for word in words:
        if random.uniform(0, 1) > p:
            new_words.append(word)
    # Keep at least one word so the result is never an empty string
    if not new_words and words:
        new_words = [random.choice(words)]
    new_text = ' '.join(new_words)
    return new_text
def random_swap(text, n=1):
    """Swap two randomly chosen word positions, n times."""
    words = text.split()
    if len(words) < 2:
        return text
    new_words = words.copy()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(new_words)), 2)
        new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
    new_text = ' '.join(new_words)
    return new_text
def get_synonyms(word):
    """Collect WordNet synonyms for a word, excluding the word itself."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace('_', ' ')
            if name.lower() != word.lower():
                synonyms.add(name)
    # Return a list so random.choice can pick from it
    return list(synonyms)
# Apply data augmentation to the text data
new_text_data = []
for text in text_data:
    new_text = synonym_replacement(text)  # or random_insertion(text), random_deletion(text), random_swap(text)
    new_text_data.append(new_text)
# Create a new DataFrame with the original columns plus the augmented text
new_df = df.copy()
new_df['new_text'] = new_text_data
# Write the new DataFrame to a CSV file
new_df.to_csv('new_sample.csv', index=False)
print('New CSV file with generated data saved successfully.')
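To sanity-check the functions, a minimal usage sketch (assuming the definitions above are in scope; the sample sentence and seed are arbitrary) might look like this:

random.seed(42)  # fix the seed so the augmented output is reproducible

sample = "The quick brown fox jumps over the lazy dog"
print(synonym_replacement(sample, n=2))
print(random_insertion(sample, n=2))
print(random_deletion(sample, p=0.3))
print(random_swap(sample, n=2))

In practice you would usually apply a random mix of these functions, and keep the augmented rows alongside the originals rather than in place of them, so the training set grows.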