Data Augmentation Techniques Suitable for Text Datasets

Besides back-translation (sketched after the list below), there are several data augmentation techniques you can try on text data. Here are some examples:

  1. Synonym Replacement: Replace words in the text with their synonyms. You can use the WordNet library in NLTK to find synonyms.
  2. Random Insertion: Insert random words into the text at random positions.
  3. Random Deletion: Delete random words from the text.
  4. Random Swap: Swap two words at random positions in the text.
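
For reference, back-translation (translating the text to a pivot language and back) can also be implemented locally with a pair of pretrained translation models. The snippet below is a minimal sketch assuming the Hugging Face transformers library and the Helsinki-NLP OPUS-MT English/French checkpoints; any pivot language pair works the same way.

from transformers import MarianMTModel, MarianTokenizer

en_fr = 'Helsinki-NLP/opus-mt-en-fr'
fr_en = 'Helsinki-NLP/opus-mt-fr-en'
en_fr_tok, en_fr_model = MarianTokenizer.from_pretrained(en_fr), MarianMTModel.from_pretrained(en_fr)
fr_en_tok, fr_en_model = MarianTokenizer.from_pretrained(fr_en), MarianMTModel.from_pretrained(fr_en)

def translate(text, tokenizer, model):
    # Tokenize, generate a translation, and decode it back to a string
    batch = tokenizer([text], return_tensors='pt', padding=True)
    generated = model.generate(**batch)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

def back_translate(text):
    # English -> French -> English; the round trip paraphrases the input
    return translate(translate(text, en_fr_tok, en_fr_model), fr_en_tok, fr_en_model)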

Here’s example code that implements these four techniques:

import pandas as pd
import random
import nltk
from nltk.corpus import stopwords, wordnet

nltk.download('wordnet')
nltk.download('stopwords')

# Stop words are skipped by synonym_replacement below
stop_words = set(stopwords.words('english'))

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('sample.csv')

# Select the column containing the text data
text_column = 'text'

# Create a list of the text data in the selected column
text_data = df[text_column].tolist()

# Define data augmentation functions
def synonym_replacement(text, n=1):
    """Replace up to n randomly chosen non-stop words with a WordNet synonym."""
    words = text.split()
    new_words = words.copy()
    # Candidates: unique words that are not stop words
    random_word_list = list(set(word for word in words if word.lower() not in stop_words))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

def random_insertion(text, n=1):
    """Insert n copies of randomly chosen words at random positions."""
    words = text.split()
    if not words:
        return text
    new_words = words.copy()
    for _ in range(n):
        random_word = random.choice(words)
        new_words.insert(random.randint(0, len(new_words)), random_word)
    return ' '.join(new_words)

def random_deletion(text, p=0.2):
    """Delete each word independently with probability p."""
    words = text.split()
    if len(words) <= 1:
        return text
    new_words = [word for word in words if random.uniform(0, 1) > p]
    if not new_words:
        # Everything was deleted; keep one word so the text is not empty
        new_words = [random.choice(words)]
    return ' '.join(new_words)

def random_swap(text, n=1):
    """Swap two words at random positions, n times."""
    words = text.split()
    if len(words) < 2:
        return text
    new_words = words.copy()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(new_words)), 2)
        new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
    return ' '.join(new_words)

def get_synonyms(word):
    """Collect WordNet synonyms for a word, excluding the word itself."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym.lower() != word.lower():
                synonyms.add(synonym)
    # Return a list so random.choice works on the result
    return list(synonyms)

# Apply data augmentation to the text data
new_text_data = []
for text in text_data:
    new_text = synonym_replacement(text) # or random_insertion(text), or random_deletion(text), or random_swap(text)
    new_text_data.append(new_text)

# Create a new DataFrame with the generated data alongside the original columns
new_df = df.copy()
new_df['new_text'] = new_text_data

# Write the new DataFrame to a CSV file
new_df.to_csv('new_sample.csv', index=False)

print('New CSV file with generated data saved successfully.')
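
Because every function is randomized, it's worth sanity-checking them on a single sentence before augmenting the whole dataset. For example:

sample = 'The quick brown fox jumps over the lazy dog'
print(synonym_replacement(sample, n=2))
print(random_insertion(sample, n=2))
print(random_deletion(sample, p=0.3))
print(random_swap(sample, n=2))

In practice, the augmented rows are usually appended to the original training data rather than replacing it, so each augmentation pass grows the dataset.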
