remove not single syllable

This commit is contained in:
Chang CL
2025-08-24 13:47:23 +08:00
parent 52683bacdd
commit cbeb11d0f1
4 changed files with 856 additions and 0 deletions

264
words_syllables.ipynb Normal file
View File

@@ -0,0 +1,264 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"id": "77365834-52b5-4443-8cc8-4ffdf0a847ba",
"metadata": {},
"outputs": [],
"source": [
"import csv"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d1c44d5e-ed2f-4122-b2ac-fcb9dca2358f",
"metadata": {},
"outputs": [],
"source": [
"single_syllable_nouns = [\n",
" \"art\", \"ash\", \"axe\", \"bag\", \"ball\", \"bar\", \"bat\", \"bay\", \"bed\", \"bee\",\n",
" \"bell\", \"belt\", \"bench\", \"bird\", \"boat\", \"book\", \"boot\", \"bow\", \"box\", \"boy\",\n",
" \"branch\", \"bread\", \"bridge\", \"brush\", \"bucket\", \"bus\", \"bush\", \"cake\", \"can\",\n",
" \"cap\", \"car\", \"card\", \"cart\", \"cat\", \"chain\", \"chair\", \"chalk\", \"cheese\", \"chest\",\n",
" \"chicken\", \"child\", \"church\", \"city\", \"class\", \"clock\", \"cloud\", \"coat\", \"code\",\n",
" \"coin\", \"couch\", \"court\", \"cow\", \"crab\", \"cream\", \"crow\", \"cup\", \"curtain\", \"dad\",\n",
" \"day\", \"deck\", \"desk\", \"dog\", \"door\", \"dress\", \"drink\", \"drop\", \"duck\", \"dust\",\n",
" \"ear\", \"earth\", \"egg\", \"eye\", \"face\", \"fact\", \"farm\", \"field\", \"file\", \"film\",\n",
" \"fire\", \"fish\", \"flag\", \"floor\", \"flower\", \"fly\", \"fog\", \"food\", \"foot\", \"fork\",\n",
" \"fox\", \"friend\", \"frog\", \"fruit\", \"game\", \"gate\", \"girl\", \"glass\", \"glove\", \"goat\",\n",
" \"god\", \"gold\", \"grass\", \"grave\", \"green\", \"ground\", \"group\", \"gum\", \"gun\", \"hair\",\n",
" \"hand\", \"hat\", \"head\", \"heart\", \"heat\", \"hill\", \"hole\", \"home\", \"horse\", \"house\",\n",
" \"ice\", \"ink\", \"jacket\", \"jam\", \"jar\", \"job\", \"key\", \"king\", \"kiss\", \"kite\",\n",
" \"knife\", \"lady\", \"lake\", \"lamp\", \"land\", \"law\", \"leaf\", \"leg\", \"letter\", \"light\",\n",
" \"line\", \"lion\", \"list\", \"lock\", \"log\", \"love\", \"lunch\", \"man\", \"map\", \"mask\",\n",
" \"meal\", \"meat\", \"men\", \"milk\", \"mind\", \"mine\", \"moon\", \"morning\", \"mother\", \"mouse\",\n",
" \"mouth\", \"name\", \"neck\", \"night\", \"noise\", \"nose\", \"note\", \"ocean\", \"office\", \"oil\",\n",
" \"orange\", \"page\", \"pain\", \"paint\", \"pan\", \"paper\", \"park\", \"part\", \"party\", \"path\",\n",
" \"peace\", \"pear\", \"pen\", \"pencil\", \"people\", \"phone\", \"photo\", \"pie\", \"pig\", \"pin\",\n",
" \"pipe\", \"place\", \"plane\", \"plant\", \"plate\", \"play\", \"point\", \"pole\", \"pool\", \"port\",\n",
" \"post\", \"pot\", \"price\", \"prince\", \"queen\", \"race\", \"rain\", \"rat\", \"ring\", \"river\",\n",
" \"road\", \"rock\", \"room\", \"root\", \"rose\", \"rule\", \"run\", \"sail\", \"salt\", \"sand\",\n",
" \"school\", \"sea\", \"seat\", \"seed\", \"shade\", \"shape\", \"sheep\", \"shelf\", \"ship\", \"shirt\",\n",
" \"shoe\", \"shop\", \"shot\", \"side\", \"sign\", \"silk\", \"sister\", \"size\", \"sky\", \"sleep\",\n",
" \"smile\", \"smoke\", \"snake\", \"snow\", \"sock\", \"son\", \"song\", \"sound\", \"soup\", \"space\",\n",
" \"speech\", \"spoon\", \"sport\", \"spring\", \"square\", \"star\", \"state\", \"steam\", \"steel\",\n",
" \"step\", \"stick\", \"stone\", \"stop\", \"store\", \"storm\", \"street\", \"string\", \"student\", \"sun\",\n",
" \"table\", \"tail\", \"tea\", \"teacher\", \"team\", \"test\", \"text\", \"thread\", \"throne\", \"time\",\n",
" \"toe\", \"town\", \"toy\", \"train\", \"tree\", \"trip\", \"truck\", \"truth\", \"tube\", \"turn\",\n",
" \"wall\", \"war\", \"watch\", \"water\", \"wave\", \"way\", \"week\", \"weight\", \"well\", \"wheel\",\n",
" \"wind\", \"window\", \"wine\", \"wing\", \"winter\", \"wire\", \"wish\", \"woman\", \"wood\", \"word\",\n",
" \"work\", \"world\", \"year\", \"youth\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7b08367-f320-459e-9dda-6392e533e979",
"metadata": {},
"outputs": [],
"source": [
"# Write the candidate nouns to nouns.csv, one word per row.\n",
"# csv.writer treats every row as a sequence of fields, so each noun is wrapped\n",
"# in a one-element list; a bare string would be split into one field per letter.\n",
"with open('nouns.csv', 'w', newline='') as csvfile:\n",
"    writer = csv.writer(csvfile)\n",
"    writer.writerows([noun] for noun in single_syllable_nouns)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1dcce87e-6b3f-4e45-b28c-499bbe1d33c9",
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.corpus import cmudict"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "86f80604-fb54-46bd-ab2a-5331ec7e5411",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package cmudict to /home/changcl/nltk_data...\n",
"[nltk_data] Package cmudict is already up-to-date!\n"
]
}
],
"source": [
"# Download the CMU Pronouncing Dictionary (NLTK reports when it is already up to date)\n",
"nltk.download('cmudict')\n",
"\n",
"# Lookup table: lowercase word -> list of pronunciations, each a list of phonemes\n",
"d = cmudict.dict()\n",
"\n",
"\n",
"def count_syllables(word):\n",
"    \"\"\"Count the syllables in the first CMUdict pronunciation of `word`.\n",
"\n",
"    One syllable is counted for every phoneme whose last character is a digit\n",
"    (CMUdict attaches a stress digit to vowel phonemes). The lookup is\n",
"    case-insensitive. Only the first listed pronunciation is consulted, so a\n",
"    word with several variants is judged by whichever variant comes first.\n",
"\n",
"    Returns 0 when the word is not in the dictionary.\n",
"    \"\"\"\n",
"    try:\n",
"        pronunciations = d[word.lower()]\n",
"    except KeyError:\n",
"        return 0\n",
"    # Only the first pronunciation decides the result, so count just that one\n",
"    # instead of counting every variant and discarding the rest.\n",
"    return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff7f7385-d2b4-439b-9079-6de0775b9435",
"metadata": {},
"outputs": [],
"source": [
"# Spot-check count_syllables on a handful of sample words\n",
"test_words = [\"computer\", \"cat\", \"elephant\", \"dog\", \"important\"]\n",
"for word in test_words:\n",
"    n_syllables = count_syllables(word)\n",
"    print(f\"'{word}': {n_syllables} syllables\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "51222d43-baaa-48ed-8b9f-58fc22bbe769",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'bucket': 2 syllables\n",
"'chicken': 2 syllables\n",
"'city': 2 syllables\n",
"'curtain': 2 syllables\n",
"'fire': 2 syllables\n",
"'flower': 2 syllables\n",
"'jacket': 2 syllables\n",
"'lady': 2 syllables\n",
"'letter': 2 syllables\n",
"'lion': 2 syllables\n",
"'morning': 2 syllables\n",
"'mother': 2 syllables\n",
"'ocean': 2 syllables\n",
"'office': 2 syllables\n",
"'orange': 2 syllables\n",
"'paper': 2 syllables\n",
"'party': 2 syllables\n",
"'pencil': 2 syllables\n",
"'people': 2 syllables\n",
"'photo': 2 syllables\n",
"'river': 2 syllables\n",
"'sister': 2 syllables\n",
"'student': 2 syllables\n",
"'table': 2 syllables\n",
"'teacher': 2 syllables\n",
"'water': 2 syllables\n",
"'window': 2 syllables\n",
"'winter': 2 syllables\n",
"'wire': 2 syllables\n",
"'woman': 2 syllables\n"
]
}
],
"source": [
"# Report every candidate noun that the dictionary counts as more than one syllable\n",
"for word in single_syllable_nouns:\n",
"    count = count_syllables(word)\n",
"    if count <= 1:\n",
"        # Single-syllable (1) or not found in the dictionary (0): nothing to report\n",
"        continue\n",
"    print(f\"'{word}': {count} syllables\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "218a5ad4-33b7-4e73-af1b-ba8c6303f012",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bucket', 'chicken', 'city', 'curtain', 'fire', 'flower', 'jacket', 'lady', 'letter', 'lion', 'morning', 'mother', 'ocean', 'office', 'orange', 'paper', 'party', 'pencil', 'people', 'photo', 'river', 'sister', 'student', 'table', 'teacher', 'water', 'window', 'winter', 'wire', 'woman']\n"
]
}
],
"source": [
"# Collect the candidate nouns counted as more than one syllable, in list order\n",
"not_single_syllable = [\n",
"    word for word in single_syllable_nouns if count_syllables(word) > 1\n",
"]\n",
"print(not_single_syllable)"
]
},
{
"cell_type": "markdown",
"id": "629d364b-9120-4615-8e04-8704a9ccddf6",
"metadata": {},
"source": [
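"Reference pattern: remove from one list every item that also appears in another list.\n",
"\n",
"```python\n",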
"list_1 = ['apple', 'banana', 'orange', 'grape', 'kiwi']\n",
"list_2 = ['banana', 'kiwi']\n",
"\n",
"# Remove items from list_1 that are in list_2\n",
"list_1 = [item for item in list_1 if item not in list_2]\n",
"\n",
"print(list_1) # Output: ['apple', 'orange', 'grape']\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5c2a24e2-f027-40c9-aca2-ddb8a1a4d969",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['art', 'ash', 'axe', 'bag', 'ball', 'bar', 'bat', 'bay', 'bed', 'bee', 'bell', 'belt', 'bench', 'bird', 'boat', 'book', 'boot', 'bow', 'box', 'boy', 'branch', 'bread', 'bridge', 'brush', 'bus', 'bush', 'cake', 'can', 'cap', 'car', 'card', 'cart', 'cat', 'chain', 'chair', 'chalk', 'cheese', 'chest', 'child', 'church', 'class', 'clock', 'cloud', 'coat', 'code', 'coin', 'couch', 'court', 'cow', 'crab', 'cream', 'crow', 'cup', 'dad', 'day', 'deck', 'desk', 'dog', 'door', 'dress', 'drink', 'drop', 'duck', 'dust', 'ear', 'earth', 'egg', 'eye', 'face', 'fact', 'farm', 'field', 'file', 'film', 'fish', 'flag', 'floor', 'fly', 'fog', 'food', 'foot', 'fork', 'fox', 'friend', 'frog', 'fruit', 'game', 'gate', 'girl', 'glass', 'glove', 'goat', 'god', 'gold', 'grass', 'grave', 'green', 'ground', 'group', 'gum', 'gun', 'hair', 'hand', 'hat', 'head', 'heart', 'heat', 'hill', 'hole', 'home', 'horse', 'house', 'ice', 'ink', 'jam', 'jar', 'job', 'key', 'king', 'kiss', 'kite', 'knife', 'lake', 'lamp', 'land', 'law', 'leaf', 'leg', 'light', 'line', 'list', 'lock', 'log', 'love', 'lunch', 'man', 'map', 'mask', 'meal', 'meat', 'men', 'milk', 'mind', 'mine', 'moon', 'mouse', 'mouth', 'name', 'neck', 'night', 'noise', 'nose', 'note', 'oil', 'page', 'pain', 'paint', 'pan', 'park', 'part', 'path', 'peace', 'pear', 'pen', 'phone', 'pie', 'pig', 'pin', 'pipe', 'place', 'plane', 'plant', 'plate', 'play', 'point', 'pole', 'pool', 'port', 'post', 'pot', 'price', 'prince', 'queen', 'race', 'rain', 'rat', 'ring', 'road', 'rock', 'room', 'root', 'rose', 'rule', 'run', 'sail', 'salt', 'sand', 'school', 'sea', 'seat', 'seed', 'shade', 'shape', 'sheep', 'shelf', 'ship', 'shirt', 'shoe', 'shop', 'shot', 'side', 'sign', 'silk', 'size', 'sky', 'sleep', 'smile', 'smoke', 'snake', 'snow', 'sock', 'son', 'song', 'sound', 'soup', 'space', 'speech', 'spoon', 'sport', 'spring', 'square', 'star', 'state', 'steam', 'steel', 'step', 'stick', 'stone', 'stop', 'store', 'storm', 'street', 'string', 'sun', 'tail', 
'tea', 'team', 'test', 'text', 'thread', 'throne', 'time', 'toe', 'town', 'toy', 'train', 'tree', 'trip', 'truck', 'truth', 'tube', 'turn', 'wall', 'war', 'watch', 'wave', 'way', 'week', 'weight', 'well', 'wheel', 'wind', 'wine', 'wing', 'wish', 'wood', 'word', 'work', 'world', 'year', 'youth']\n"
]
}
],
"source": [
"# Keep the candidates that were not flagged as multi-syllable, preserving order\n",
"single_syllable_nouns_cleaned = []\n",
"for item in single_syllable_nouns:\n",
"    if item not in not_single_syllable:\n",
"        single_syllable_nouns_cleaned.append(item)\n",
"print(single_syllable_nouns_cleaned)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9701af0e-440e-4cf7-886d-815fc720eb68",
"metadata": {},
"outputs": [],
"source": [
"# Write the filtered nouns to cleaned_nouns.csv, one word per row.\n",
"# csv.writer treats every row as a sequence of fields, so each noun is wrapped\n",
"# in a one-element list; a bare string would be split into one field per letter.\n",
"with open('cleaned_nouns.csv', 'w', newline='') as csvfile:\n",
"    writer = csv.writer(csvfile)\n",
"    writer.writerows([noun] for noun in single_syllable_nouns_cleaned)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}