jupyter/words_syllables.ipynb at main

Files

Chang CL 4d873cd9e8 remove negative words

2025-08-24 13:55:44 +08:00

13 KiB

Raw Permalink Blame History

In [11]:

import csv

In [1]:

# Candidate noun list (lowercase, alphabetical). Despite the name, it still
# contains multi-syllable entries such as "bucket", "chicken" and "city";
# the CMUdict check further down reports them so they can be filtered out.
single_syllable_nouns = [
    "art", "ash", "axe", "bag", "ball", "bar", "bat", "bay", "bed", "bee",
    "bell", "belt", "bench", "bird", "boat", "book", "boot", "bow", "box", "boy",
    "branch", "bread", "bridge", "brush", "bucket", "bus", "bush", "cake", "can",
    "cap", "car", "card", "cart", "cat", "chain", "chair", "chalk", "cheese", "chest",
    "chicken", "child", "church", "city", "class", "clock", "cloud", "coat", "code",
    "coin", "couch", "court", "cow", "crab", "cream", "crow", "cup", "curtain", "dad",
    "day", "deck", "desk", "dog", "door", "dress", "drink", "drop", "duck", "dust",
    "ear", "earth", "egg", "eye", "face", "fact", "farm", "field", "file", "film",
    "fire", "fish", "flag", "floor", "flower", "fly", "fog", "food", "foot", "fork",
    "fox", "friend", "frog", "fruit", "game", "gate", "girl", "glass", "glove", "goat",
    "god", "gold", "grass", "grave", "green", "ground", "group", "gum", "gun", "hair",
    "hand", "hat", "head", "heart", "heat", "hill", "hole", "home", "horse", "house",
    "ice", "ink", "jacket", "jam", "jar", "job", "key", "king", "kiss", "kite",
    "knife", "lady", "lake", "lamp", "land", "law", "leaf", "leg", "letter", "light",
    "line", "lion", "list", "lock", "log", "love", "lunch", "man", "map", "mask",
    "meal", "meat", "men", "milk", "mind", "mine", "moon", "morning", "mother", "mouse",
    "mouth", "name", "neck", "night", "noise", "nose", "note", "ocean", "office", "oil",
    "orange", "page", "pain", "paint", "pan", "paper", "park", "part", "party", "path",
    "peace", "pear", "pen", "pencil", "people", "phone", "photo", "pie", "pig", "pin",
    "pipe", "place", "plane", "plant", "plate", "play", "point", "pole", "pool", "port",
    "post", "pot", "price", "prince", "queen", "race", "rain", "rat", "ring", "river",
    "road", "rock", "room", "root", "rose", "rule", "run", "sail", "salt", "sand",
    "school", "sea", "seat", "seed", "shade", "shape", "sheep", "shelf", "ship", "shirt",
    "shoe", "shop", "shot", "side", "sign", "silk", "sister", "size", "sky", "sleep",
    "smile", "smoke", "snake", "snow", "sock", "son", "song", "sound", "soup", "space",
    "speech", "spoon", "sport", "spring", "square", "star", "state", "steam", "steel",
    "step", "stick", "stone", "stop", "store", "storm", "street", "string", "student", "sun",
    "table", "tail", "tea", "teacher", "team", "test", "text", "thread", "throne", "time",
    "toe", "town", "toy", "train", "tree", "trip", "truck", "truth", "tube", "turn",
    "wall", "war", "watch", "water", "wave", "way", "week", "weight", "well", "wheel",
    "wind", "window", "wine", "wing", "winter", "wire", "wish", "woman", "wood", "word",
    "work", "world", "year", "youth"
]

In [ ]:

# Write one noun per CSV row.
# csv.writer.writerows() expects an iterable of rows, and a bare string is
# itself iterated character by character ("art" would be written as a,r,t),
# so each word is wrapped in a single-element row to keep it in one field.
with open('nouns.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([word] for word in single_syllable_nouns)

In [2]:

import nltk
from nltk.corpus import cmudict

In [3]:

# Download the CMU Pronouncing Dictionary (nltk just reports the package as
# "already up-to-date" when it is present locally)
nltk.download('cmudict')

# Pronunciation lookup used by count_syllables(): indexed by lowercase word;
# each value is a sequence of pronunciations, each a sequence of phoneme strings.
d = cmudict.dict()

def count_syllables(word):
    """Return the number of syllables in ``word`` according to the lookup ``d``.

    Only the first pronunciation listed for the word is used. One syllable is
    counted for every phoneme whose last character is a digit (in CMUdict the
    vowel phonemes carry a trailing stress digit).

    Args:
        word: The word to look up; it is lowercased before the lookup.

    Returns:
        int: The syllable count, or 0 when the word is not a key of ``d``.
    """
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        # Unknown word: report 0 rather than raising so callers can keep looping.
        return 0
    # Inspect only the first pronunciation instead of counting every variant
    # and then discarding all but element [0].
    first_pronunciation = pronunciations[0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

[nltk_data] Downloading package cmudict to /home/changcl/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!

In [ ]:

# Sanity-check count_syllables() against a handful of everyday words
test_words = ["computer", "cat", "elephant", "dog", "important"]
for word in test_words:
    count = count_syllables(word)
    print(f"'{word}': {count} syllables")

In [6]:

# Report every entry of the noun list that the CMUdict lookup counts as
# having more than one syllable.
for word in single_syllable_nouns:
    count = count_syllables(word)
    if count <= 1:
        # One syllable (or 0 = not in the dictionary): nothing to report.
        continue
    print(f"'{word}': {count} syllables")

'bucket': 2 syllables
'chicken': 2 syllables
'city': 2 syllables
'curtain': 2 syllables
'fire': 2 syllables
'flower': 2 syllables
'jacket': 2 syllables
'lady': 2 syllables
'letter': 2 syllables
'lion': 2 syllables
'morning': 2 syllables
'mother': 2 syllables
'ocean': 2 syllables
'office': 2 syllables
'orange': 2 syllables
'paper': 2 syllables
'party': 2 syllables
'pencil': 2 syllables
'people': 2 syllables
'photo': 2 syllables
'river': 2 syllables
'sister': 2 syllables
'student': 2 syllables
'table': 2 syllables
'teacher': 2 syllables
'water': 2 syllables
'window': 2 syllables
'winter': 2 syllables
'wire': 2 syllables
'woman': 2 syllables

In [7]:

# Collect, in list order, the nouns whose looked-up syllable count exceeds one.
not_single_syllable = [
    word
    for word in single_syllable_nouns
    if count_syllables(word) > 1
]
print(not_single_syllable)

['bucket', 'chicken', 'city', 'curtain', 'fire', 'flower', 'jacket', 'lady', 'letter', 'lion', 'morning', 'mother', 'ocean', 'office', 'orange', 'paper', 'party', 'pencil', 'people', 'photo', 'river', 'sister', 'student', 'table', 'teacher', 'water', 'window', 'winter', 'wire', 'woman']

list_1 = ['apple', 'banana', 'orange', 'grape', 'kiwi']
list_2 = ['banana', 'kiwi']

# Keep only the entries of list_1 that do not occur in list_2 (order preserved)
list_1 = [
    fruit
    for fruit in list_1
    if fruit not in list_2
]

print(list_1)  # Output: ['apple', 'orange', 'grape']

In [9]:

# Drop the nouns that were flagged as multi-syllable, keeping the original order.
single_syllable_nouns_cleaned = [
    noun
    for noun in single_syllable_nouns
    if noun not in not_single_syllable
]
print(single_syllable_nouns_cleaned)

['art', 'ash', 'axe', 'bag', 'ball', 'bar', 'bat', 'bay', 'bed', 'bee', 'bell', 'belt', 'bench', 'bird', 'boat', 'book', 'boot', 'bow', 'box', 'boy', 'branch', 'bread', 'bridge', 'brush', 'bus', 'bush', 'cake', 'can', 'cap', 'car', 'card', 'cart', 'cat', 'chain', 'chair', 'chalk', 'cheese', 'chest', 'child', 'church', 'class', 'clock', 'cloud', 'coat', 'code', 'coin', 'couch', 'court', 'cow', 'crab', 'cream', 'crow', 'cup', 'dad', 'day', 'deck', 'desk', 'dog', 'door', 'dress', 'drink', 'drop', 'duck', 'dust', 'ear', 'earth', 'egg', 'eye', 'face', 'fact', 'farm', 'field', 'file', 'film', 'fish', 'flag', 'floor', 'fly', 'fog', 'food', 'foot', 'fork', 'fox', 'friend', 'frog', 'fruit', 'game', 'gate', 'girl', 'glass', 'glove', 'goat', 'god', 'gold', 'grass', 'grave', 'green', 'ground', 'group', 'gum', 'gun', 'hair', 'hand', 'hat', 'head', 'heart', 'heat', 'hill', 'hole', 'home', 'horse', 'house', 'ice', 'ink', 'jam', 'jar', 'job', 'key', 'king', 'kiss', 'kite', 'knife', 'lake', 'lamp', 'land', 'law', 'leaf', 'leg', 'light', 'line', 'list', 'lock', 'log', 'love', 'lunch', 'man', 'map', 'mask', 'meal', 'meat', 'men', 'milk', 'mind', 'mine', 'moon', 'mouse', 'mouth', 'name', 'neck', 'night', 'noise', 'nose', 'note', 'oil', 'page', 'pain', 'paint', 'pan', 'park', 'part', 'path', 'peace', 'pear', 'pen', 'phone', 'pie', 'pig', 'pin', 'pipe', 'place', 'plane', 'plant', 'plate', 'play', 'point', 'pole', 'pool', 'port', 'post', 'pot', 'price', 'prince', 'queen', 'race', 'rain', 'rat', 'ring', 'road', 'rock', 'room', 'root', 'rose', 'rule', 'run', 'sail', 'salt', 'sand', 'school', 'sea', 'seat', 'seed', 'shade', 'shape', 'sheep', 'shelf', 'ship', 'shirt', 'shoe', 'shop', 'shot', 'side', 'sign', 'silk', 'size', 'sky', 'sleep', 'smile', 'smoke', 'snake', 'snow', 'sock', 'son', 'song', 'sound', 'soup', 'space', 'speech', 'spoon', 'sport', 'spring', 'square', 'star', 'state', 'steam', 'steel', 'step', 'stick', 'stone', 'stop', 'store', 'storm', 'street', 'string', 'sun', 'tail', 
'tea', 'team', 'test', 'text', 'thread', 'throne', 'time', 'toe', 'town', 'toy', 'train', 'tree', 'trip', 'truck', 'truth', 'tube', 'turn', 'wall', 'war', 'watch', 'wave', 'way', 'week', 'weight', 'well', 'wheel', 'wind', 'wine', 'wing', 'wish', 'wood', 'word', 'work', 'world', 'year', 'youth']

In [12]:

# Write one cleaned noun per CSV row.
# csv.writer.writerows() expects an iterable of rows, and a bare string is
# itself iterated character by character ("art" would be written as a,r,t),
# so each word is wrapped in a single-element row to keep it in one field.
with open('cleaned_nouns.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([word] for word in single_syllable_nouns_cleaned)

In [14]:

from textblob import TextBlob

# Alternative method using sentiment analysis (requires: pip install textblob)
def is_negative(word, threshold=-0.1):
    """Return True when TextBlob scores ``word`` below ``threshold``.

    Args:
        word: Text to score; passed to ``TextBlob`` unchanged.
        threshold: Polarity cut-off. Scores strictly below it count as
            negative. Defaults to -0.1, the value previously hard-coded here.

    Returns:
        bool: ``True`` if ``TextBlob(word).sentiment.polarity < threshold``.
    """
    # NOTE(review): scoring isolated words is noisy -- on the cleaned noun list
    # this flags "cow", "game" and "green" -- so inspect the hits before
    # dropping them.
    polarity = TextBlob(word).sentiment.polarity
    return polarity < threshold

In [20]:

# Show which of the cleaned nouns the sentiment check flags as negative.
for word in single_syllable_nouns_cleaned:
    if not is_negative(word):
        continue
    print(word)

cow
game
green

In [15]:

# Keep only the cleaned nouns that the sentiment check does not flag as negative.
sentiment_filtered_nouns = [
    noun
    for noun in single_syllable_nouns_cleaned
    if not is_negative(noun)
]

13 KiB Raw Permalink Blame History

13 KiB

Raw Permalink Blame History