/src/whoosh/lang/morph_en.py
Python | 941 lines | 928 code | 0 blank | 13 comment | 0 complexity | e5e41af733ed576b04622b79727d56d5 MD5 | raw file
Possible License(s): Apache-2.0
- """
- Contains the variations() function for expanding an English word into multiple
- variations by programmatically adding and removing suffixes.
- Translated to Python from the ``com.sun.labs.minion.lexmorph.LiteMorph_en``
- class of Sun's `Minion search engine <https://minion.dev.java.net/>`_.
- """
- import re
- from whoosh.compat import xrange, iteritems
# Rule exceptions
#
# Each entry is a single string holding one group of related word forms
# separated by single spaces. The lookup table built from this list maps every
# individual word to the group string it appears in. Several words are listed
# in more than one group (e.g. "bases", "can", "die"); because groups are
# processed in order, such a word ends up associated with the LAST group that
# contains it, so the order of the entries is significant.
exceptions = [
    "a",
    "abandoner abandon abandons abandoned abandoning abandonings abandoners",
    "abdomen abdomens",
    "about",
    "above",
    "acid acids acidic acidity acidities",
    "across",
    "act acts acted acting actor actors",
    "ad ads",
    "add adds added adding addings addition additions adder adders",
    "advertise advertises advertised advertising advertiser advertisers advertisement advertisements advertisings",
    "after",
    "again",
    "against",
    "ago",
    "all",
    "almost",
    "along",
    "already",
    "also",
    "although",
    "alumna alumnae alumnus alumni",
    "always",
    "amen amens",
    "amidships",
    "amid amidst",
    "among amongst",
    "an",
    "analysis analyses",
    "and",
    "another other others",
    "antenna antennas antennae",
    "antitheses antithesis",
    "any",
    "anyone anybody",
    "anything",
    "appendix appendixes appendices",
    "apropos",
    "aquarium aquariums aquaria",
    "argument arguments argue argues argued arguing arguings arguer arguers",
    "arise arises arose arisen ariser arisers arising arisings",
    "around",
    "as",
    "asbestos",
    "at",
    "atlas atlases",
    "auger augers augered augering augerings augerer augerers",
    "augment augments augmented augmenting augmentings augmentation augmentations augmenter augmenters",
    "automata automaton automatons",
    "automation automating automate automates automated automatic",
    "avoirdupois",
    "awake awakes awoke awaked awoken awaker awakers awaking awakings awakening awakenings",
    "away",
    "awful awfully awfulness",
    "axis axes axises",
    "bacillus bacilli",
    "bacterium bacteria",
    "bad worse worst badly badness",
    "bas",
    "bases basis",
    "bases base based basing basings basely baseness basenesses basement basements baseless basic basics",
    "be am are is was were been being",
    "bear bears bore borne bearing bearings bearer bearers",
    "beat beats beaten beating beatings beater beaters",
    "because",
    "become becomes became becoming",
    "beef beefs beeves beefed beefing",
    "beer beers",
    "before",
    "begin begins began begun beginning beginnings beginner beginners",
    "behalf behalves",
    "being beings",
    "bend bends bent bending bendings bender benders",
    "bereave bereaves bereaved bereft bereaving bereavings bereavement bereavements",
    "beside besides",
    "best bests bested besting",
    "bet bets betting bettor bettors",
    "betimes",
    "between",
    "beyond",
    "bid bids bade bidden bidding biddings bidder bidders",
    "bier biers",
    "bind binds bound binding bindings binder binders",
    "bit bits",
    "bite bites bit bitten biting bitings biter biters",
    "blackfoot blackfeet",
    "bleed bleeds bled bleeding bleedings bleeder bleeders",
    "blow blows blew blown blowing blowings blower blowers",
    "bookshelf bookshelves",
    "both",
    "bound bounds bounded bounding boundings bounder bounders boundless",
    "bourgeois bourgeoisie",
    "bra bras",
    "brahman brahmans",
    "break breaks broke broken breaking breakings breaker breakers",
    "breed breeds bred breeding breedings breeder breeders",
    "bring brings brought bringing bringings bringer bringers",
    "build builds built building buildings builder builders",
    "bus buses bused bussed busing bussing busings bussings buser busers busser bussers",
    "buss busses bussed bussing bussings busser bussers",
    "but",
    "buy buys bought buying buyings buyer buyers",
    "by",
    "calf calves calved calving calvings calver calvers",
    "can cans canned canning cannings canner canners",
    "can could cannot",
    "canoes canoe canoed canoeing canoeings canoer canoers",
    "catch catches caught catching catchings catcher catchers",
    "cement cements cemented cementing cementings cementer cementers",
    "cent cents",
    "center centers centered centering centerings centerless",
    "child children childless childish childishly",
    "choose chooses chose chosen choosing choosings chooser choosers",
    "cling clings clung clinging clingings clinger clingers",
    "colloquium colloquia colloquiums",
    "come comes came coming comings comer comers",
    "comment comments commented commenting commentings commenter commenters",
    "compendium compendia compendiums",
    "complement complements complemented complementing complementings complementer complementers complementary",
    "compliment compliments complimented complimenting complimentings complimenter complimenters complimentary",
    "concerto concertos concerti",
    "condiment condiments",
    "corps",
    "cortex cortices cortexes cortical",
    "couscous",
    "creep creeps crept creeping creepings creeper creepers creepy",
    "crisis crises",
    "criterion criteria criterial",
    "cryptanalysis cryptanalyses",
    "curriculum curricula curriculums curricular",
    "datum data",
    "day days daily",
    "deal deals dealt dealing dealings dealer dealers",
    "decrement decrements decremented decrementing decrementings decrementer decrementers decremental",
    "deer deers",
    "demented dementia",
    "desideratum desiderata",
    "diagnosis diagnoses diagnose diagnosed diagnosing diagnostic",
    "dialysis dialyses",
    "dice dices diced dicing dicings dicer dicers",
    "die dice",
    "die dies died dying dyings",
    "dig digs dug digging diggings digger diggers",
    "dive dives diver divers dove dived diving divings",
    "divest divests divester divesters divested divesting divestings divestment divestments",
    "do does did done doing doings doer doers",
    "document documents documented documenting documentings documenter documenters documentation documentations documentary",
    "doe does",
    "dove doves",
    "downstairs",
    "dozen",
    "draw draws drew drawn drawing drawings drawer drawers",
    "drink drinks drank drunk drinking drinkings drinker drinkers",
    "drive drives drove driven driving drivings driver drivers driverless",
    "due dues duly",
    "during",
    "e",
    "each",
    "eager eagerer eagerest eagerly eagerness eagernesses",
    "early earlier earliest",
    "easement easements",
    "eat eats ate eaten eating eatings eater eaters",
    "effluvium effluvia",
    "either",
    "element elements elementary",
    "elf elves elfen",
    "ellipse ellipses elliptic elliptical elliptically",
    "ellipsis ellipses elliptic elliptical elliptically",
    "else",
    "embolus emboli embolic embolism",
    "emolument emoluments",
    "emphasis emphases",
    "employ employs employed employing employer employers employee employees employment employments employable",
    "enough",
    "equilibrium equilibria equilibriums",
    "erratum errata",
    "ever",
    "every",
    "everything",
    "exotic exotically exoticness exotica",
    "experiment experiments experimented experimenting experimentings experimenter experimenters experimentation experimental",
    "extra extras",
    "fall falls fell fallen falling fallings faller fallers",
    "far farther farthest",
    "fee fees feeless",
    "feed feeds fed feeding feedings feeder feeders",
    "feel feels felt feeling feelings feeler feelers",
    "ferment ferments fermented fermenting fermentings fermentation fermentations fermenter fermenters",
    "few fewer fewest",
    "fight fights fought fighting fightings fighter fighters",
    "figment figments",
    "filament filaments",
    "find finds found finding findings finder finders",
    "firmament firmaments",
    "flee flees fled fleeing fleeings",
    "fling flings flung flinging flingings flinger flingers",
    "floe floes",
    "fly flies flew flown flying flyings flier fliers flyer flyers",
    "focus foci focuses focused focusing focusses focussed focussing focuser focal",
    "foment foments fomented fomenting fomentings fomenter fomenters",
    "foot feet",
    "foot foots footed footing footer footers",
    "footing footings footer footers",
    "for",
    "forbid forbids forbade forbidden forbidding forbiddings forbidder forbidders",
    "foresee foresaw foreseen foreseeing foreseeings foreseer foreseers",
    "forest forests forester foresting forestation forestations",
    "forget forgets forgot forgotten forgetting forgettings forgetter forgetters forgetful",
    "forsake forsakes forsook forsaken forsaking forsakings forsaker forsakers",
    "found founds founded founding foundings founder founders",
    "fragment fragments fragmented fragmenting fragmentings fragmentation fragmentations fragmenter fragmenters",
    "free frees freer freest freed freeing freely freeness freenesses",
    "freeze freezes froze frozen freezing freezings freezer freezers",
    "from",
    "full fully fuller fullest",
    "fuller fullers full fulls fulled fulling fullings",
    "fungus fungi funguses fungal",
    "gallows",
    "ganglion ganglia ganglions ganglionic",
    "garment garments",
    "gas gasses gassed gassing gassings gasser gassers",
    "gas gases gasses gaseous gasless",
    "gel gels gelled gelling gellings geller gellers",
    "german germans germanic germany German Germans Germanic Germany",
    "get gets got gotten getting gettings getter getters",
    "give gives gave given giving givings giver givers",
    "gladiolus gladioli gladioluses gladiola gladiolas gladiolae",
    "glans glandes",
    "gluiness gluey glue glues glued gluing gluings gluer gluers",
    "go goes went gone going goings goer goers",
    "godchild godchildren",
    "good better best goodly goodness goodnesses",
    "goods",
    "goose geese",
    "goose gooses goosed goosing goosings gooser goosers",
    "grandchild grandchildren",
    "grind grinds ground grinding grindings grinder grinders",
    "ground grounds grounded grounding groundings grounder grounders groundless",
    "grow grows grew grown growing growings grower growers growth",
    "gum gums gummed gumming gummings gummer gummers",
    "half halves",
    "halve halves halved halving halvings halver halvers",
    "hang hangs hung hanged hanging hangings hanger hangers",
    "have has had having havings haver havers",
    "he him his himself",
    "hear hears heard hearing hearings hearer hearers",
    "here",
    "hide hides hid hidden hiding hidings hider hiders",
    "hippopotamus hippopotami hippopotamuses",
    "hold holds held holding holdings holder holders",
    "honorarium honoraria honorariums",
    "hoof hoofs hooves hoofed hoofing hoofer hoofers",
    "how",
    "hum hums hummed humming hummings hummer hummers",
    "hymen hymens hymenal",
    "hypotheses hypothesis hypothesize hypothesizes hypothesized hypothesizer hypothesizing hypothetical hypothetically",
    "i",
    "if iffy",
    "impediment impediments",
    "implement implements implemented implementing implementings implementation implementations implementer implementers",
    "imply implies implied implying implyings implier impliers",
    "in inner",
    "inclement",
    "increment increments incremented incrementing incrementings incrementer incrementers incremental incrementally",
    "index indexes indexed indexing indexings indexer indexers",
    "index indexes indices indexical indexicals",
    "indoor indoors",
    "instrument instruments instrumented instrumenting instrumentings instrumenter instrumenters instrumentation instrumentations instrumental",
    "integument integumentary",
    "into",
    "it its itself",
    "java",
    "july julys",
    "keep keeps kept keeping keepings keeper keepers",
    "knife knifes knifed knifing knifings knifer knifers",
    "knife knives",
    "know knows knew known knowing knowings knower knowers knowledge",
    "lament laments lamented lamenting lamentings lamentation lamentations lamenter lamenters lamentable lamentably",
    "larva larvae larvas larval",
    "late later latest lately lateness",
    "latter latterly",
    "lay lays laid laying layer layers",
    "layer layers layered layering layerings",
    "lead leads led leading leadings leader leaders leaderless",
    "leaf leafs leafed leafing leafings leafer leafers",
    "leaf leaves leafless",
    "leave leaves left leaving leavings leaver leavers",
    "lend lends lent lending lendings lender lenders",
    "less lesser least",
    "let lets letting lettings",
    "lie lies lay lain lying lier liers",
    "lie lies lied lying liar liars",
    "life lives lifeless",
    "light lights lit lighted lighting lightings lightly lighter lighters lightness lightnesses lightless",
    "likely likelier likeliest",
    "limen limens",
    "lineament lineaments",
    "liniment liniments",
    "live alive living",
    "live lives lived living livings",
    "liver livers",
    "loaf loafs loafed loafing loafings loafer loafers",
    "loaf loaves",
    "logic logics logical logically",
    "lose loses lost losing loser losers loss losses",
    "louse lice",
    "lumen lumens",
    "make makes made making makings maker makers",
    "man mans manned manning mannings",
    "man men",
    "manly manlier manliest manliness manful manfulness manhood",
    "manic manically",
    "manner manners mannered mannerly mannerless mannerful",
    "many",
    "matrix matrices matrixes",
    "may might",
    "maximum maxima maximums maximal maximize maximizes maximized maximizing",
    "mean means meant meaning meanings meaningless meaningful",
    "mean meaner meanest meanly meanness meannesses",
    "median medians medianly medial",
    "medium media mediums",
    "meet meets met meeting meetings",
    "memorandum memoranda memorandums",
    "mere merely",
    "metal metals metallic",
    "might mighty mightily",
    # The single-n spellings are common misspellings kept for backward
    # compatibility; the correctly spelled forms are listed after them.
    "millenium millennia milleniums millennial millennium millenniums",
    "mine mines mined mining minings miner miners",
    "mine my our ours",
    "minimum minima minimums minimal",
    "minus minuses",
    "miscellaneous miscellanea miscellaneously miscellaneousness miscellany",
    "molest molests molested molesting molestings molester molesters",
    "moment moments",
    "monument monuments monumental",
    "more most",
    "mouse mice mouseless",
    "much",
    "multiply multiplies multiplier multipliers multiple multiples multiplying multiplyings multiplication multiplications",
    "mum mums mummed mumming mummings mummer mummers",
    "must musts",
    "neither",
    "nemeses nemesis",
    "neurosis neuroses neurotic neurotics",
    "nomen",
    "none",
    "nos no noes",
    "not",
    "nothing nothings nothingness",
    "now",
    "nowadays",
    "nucleus nuclei nucleuses nuclear",
    "number numbers numbered numbering numberings numberless",
    "nutriment nutriments nutrient nutrients nutrition nutritions",
    "oasis oases",
    "octopus octopi octopuses",
    "of",
    "off",
    "offer offers offered offering offerings offerer offerers offeror offerors",
    "often",
    "oftentimes",
    "ointment ointments",
    "omen omens",
    "on",
    "once",
    "only",
    "ornament ornaments ornamented ornamenting ornamentings ornamentation ornamenter ornamenters ornamental",
    "outdoor outdoors",
    "outlay outlays",
    "outlie outlies outlay outlied outlain outlying outlier outliers",
    "ovum ova",
    "ox oxen",
    "parentheses parenthesis",
    "parliament parliaments parliamentary",
    "passerby passer-by passersby passers-by",
    "past pasts",
    "pay pays paid paying payings payer payers payee payees payment payments",
    "per",
    "perhaps",
    "person persons people",
    "phenomenon phenomena phenomenal",
    "pi",
    "picnic picnics picnicker picnickers picnicked picnicking picnickings",
    "pigment pigments pigmented pigmenting pigmentings pigmenter pigmenters pigmentation pigmentations",
    "please pleases pleased pleasing pleasings pleaser pleasers pleasure pleasures pleasuring pleasurings pleasant pleasantly pleasureless pleasureful",
    "plus pluses plusses",
    "polyhedra polyhedron polyhedral",
    "priest priests priestly priestlier priestliest priestliness priestless",
    "prognosis prognoses",
    "prostheses prosthesis",
    "prove proves proved proving provings proofs proof prover provers provable",
    "psychosis psychoses psychotic psychotics",
    "qed",
    "quiz quizzes quizzed quizzing quizzings quizzer quizzers",
    "raiment",
    "rather",
    "re",
    "real really",
    "redo redoes redid redone redoing redoings redoer redoers",
    "regiment regiments regimented regimenting regimenter regimenters regimentation regimental",
    "rendezvous",
    "requiz requizzes requizzed requizzing requizzings requizzer requizzers",
    "ride rides rode ridden riding ridings rider riders rideless",
    "ring rings rang rung ringing ringings ringer ringers ringless",
    "rise rises rose risen rising risings riser risers",
    "rose roses",
    "rudiment rudiments rudimentary",
    "rum rums rummed rumming rummings rummer rummers",
    "run runs ran running runnings runner runners",
    "sacrament sacraments sacramental",
    "same sameness",
    "sans",
    "saw saws sawed sawn sawing sawings sawyer sawyers",
    "say says said saying sayings sayer sayers",
    "scarf scarfs scarves scarfless",
    "schema schemata schemas",
    "sediment sediments sedimentary sedimentation sedimentations",
    "see sees saw seen seeing seeings seer seers",
    "seek seeks sought seeking seekings seeker seekers",
    "segment segments segmented segmenting segmentings segmenter segmenters segmentation segmentations",
    "self selves selfless",
    "sell sells sold selling sellings seller sellers",
    "semen",
    "send sends sent sending sendings sender senders",
    "sentiment sentiments sentimental",
    "series",
    "set sets setting settings",
    "several severally",
    "sew sews sewed sewn sewing sewings sewer sewers",
    "sewer sewers sewerless",
    "shake shakes shook shaken shaking shakings shaker shakers",
    "shall should",
    "shaman shamans",
    "shave shaves shaved shaven shaving shavings shaver shavers shaveless",
    "she her hers herself",
    "sheaf sheaves sheafless",
    "sheep",
    "shelf shelves shelved shelfing shelvings shelver shelvers shelfless",
    "shine shines shined shone shining shinings shiner shiners shineless",
    "shoe shoes shoed shod shoeing shoeings shoer shoers shoeless",
    "shoot shoots shot shooting shootings shooter shooters",
    "shot shots",
    "show shows showed shown showing showings shower showers",
    "shower showers showery showerless",
    "shrink shrinks shrank shrunk shrinking shrinkings shrinker shrinkers shrinkable",
    "sideways",
    "simply simple simpler simplest",
    "since",
    "sing sings sang sung singing singings singer singers singable",
    "sink sinks sank sunk sinking sinkings sinker sinkers sinkable",
    "sit sits sat sitting sittings sitter sitters",
    "ski skis skied skiing skiings skier skiers skiless skiable",
    "sky skies",
    "slay slays slew slain slaying slayings slayer slayers",
    "sleep sleeps slept sleeping sleepings sleeper sleepers sleepless",
    "so",
    "some",
    "something",
    "sometime sometimes",
    "soon",
    "spa spas",
    "speak speaks spoke spoken speaking speakings speaker speakers",
    "species specie",
    "spectrum spectra spectrums",
    "speed speeds sped speeded speeding speedings speeder speeders",
    "spend spends spent spending spendings spender spenders spendable",
    "spin spins spun spinning spinnings spinner spinners",
    "spoke spokes",
    "spring springs sprang sprung springing springings springer springers springy springiness",
    "staff staffs staves staffed staffing staffings staffer staffers",
    "stand stands stood standing standings",
    "stasis stases",
    "steal steals stole stolen stealing stealings stealer stealers",
    "stick sticks stuck sticking stickings sticker stickers",
    "stigma stigmata stigmas stigmatize stigmatizes stigmatized stigmatizing",
    "stimulus stimuli",
    "sting stings stung stinging stingings stinger stingers",
    "stink stinks stank stunk stinking stinkings stinker stinkers",
    "stomach stomachs",
    "stratum strata stratums",
    "stride strides strode stridden striding stridings strider striders",
    "string strings strung stringing stringings stringer stringers stringless",
    "strive strives strove striven striving strivings striver strivers",
    "strum strums strummed strumming strummings strummer strummers strummable",
    "such",
    "suffer suffers suffered suffering sufferings sufferer sufferers sufferable",
    "suggest suggests suggested suggesting suggestings suggester suggesters suggestor suggestors suggestive suggestion suggestions suggestible suggestable",
    "sum sums summed summing summings summer summers",
    "summer summers summered summering summerings",
    "supplement supplements supplemented supplementing supplementings supplementation supplementer supplementers supplementary supplemental",
    "supply supplies supplied supplying supplyings supplier suppliers",
    "swear swears swore sworn swearing swearings swearer swearers",
    "sweep sweeps swept sweeping sweepings sweeper sweepers",
    "swell swells swelled swollen swelling swellings",
    "swim swims swam swum swimming swimmings swimmer swimmers swimable",
    "swine",
    "swing swings swung swinging swingings swinger swingers",
    "syllabus syllabi syllabuses",
    "symposium symposia symposiums",
    "synapse synapses",
    "synapsis synapses",
    "synopsis synopses",
    "synthesis syntheses",
    "tableau tableaux tableaus",
    "take takes took taken taking takings taker takers takable",
    "teach teaches taught teaching teachings teacher teachers teachable",
    "tear tears tore torn tearing tearings tearer tearers tearable",
    "tegument teguments",
    "tell tells told telling tellings teller tellers tellable",
    "temperament temperaments temperamental temperamentally",
    "tenement tenements",
    "the",
    "there theres",
    "theses thesis",
    "they them their theirs themselves",
    "thief thieves thieving thievings",
    "think thinks thought thinking thinker thinkers thinkable",
    "this that these those",
    "thought thoughts thoughtful thoughtless",
    "throw throws threw thrown throwing throwings thrower throwers throwable",
    "tic tics",
    "tie ties tied tying tyings tier tiers tieable tieless",
    "tier tiers tiered tiering tierings tierer tierers",
    "to",
    "toe toes toed toeing toeings toer toers toeless",
    "together togetherness",
    "too",
    "tooth teeth toothless",
    "topaz topazes",
    "torment torments tormented tormenting tormentings tormenter tormenters tormentable",
    "toward towards",
    "tread treads trod trodden treading treadings treader treaders",
    "tread treads treadless retread retreads",
    "true truly trueness",
    "two twos",
    "u",
    "under",
    "underlay underlays underlaid underlaying underlayings underlayer underlayers",
    "underlie underlies underlay underlain underlying underlier underliers",
    "undo undoes undid undone undoing undoings undoer undoers undoable",
    "unrest unrestful",
    "until",
    "unto",
    "up",
    "upon",
    "upstairs",
    "use uses user users used using useful useless",
    "various variously",
    "vehement vehemently vehemence",
    "versus",
    "very",
    "visit visits visited visiting visitings visitor visitors",
    "vortex vortexes vortices",
    "wake wakes woke waked woken waking wakings waker wakers wakeful wakefulness wakefulnesses wakeable",
    "wear wears wore worn wearing wearings wearer wearers wearable",
    "weather weathers weathered weathering weatherly",
    "weave weaves wove woven weaving weavings weaver weavers weaveable",
    "weep weeps wept weeping weepings weeper weepers",
    "wharf wharfs wharves",
    "where wheres",
    "whereas whereases",
    "whether whethers",
    "while whiles whilst whiled whiling",
    "whiz whizzes whizzed whizzing whizzings whizzer whizzers",
    "who whom whos whose whoses",
    "why whys",
    "wife wives wifeless",
    "will wills willed willing willings willful",
    "will would",
    "win wins won winning winnings winner winners winnable",
    "wind winds wound winding windings winder winders windable",
    "wind winds windy windless",
    "with",
    "within",
    "without",
    "wolf wolves",
    "woman women womanless womanly",
    "wound wounds wounded wounding woundings",
    "write writes wrote written writing writings writer writers writeable",
    "yeses yes",
    "yet yets",
    "you your yours yourself",
]
# Lookup table: individual word form -> the complete space-separated group
# string from ``exceptions`` that contains it. Groups are merged in list
# order, so a word present in several groups maps to the last one.
_exdict = {}
for exlist in exceptions:
    _exdict.update(dict.fromkeys(exlist.split(" "), exlist))
- # Programmatic rules
- vowels = "aeiouy"
- cons = "bcdfghjklmnpqrstvwxyz"
- rules = (
- # Words ending in S
- # (e.g., happiness, business)
- (r"[%s].*[%s](iness)" % (vowels, cons), "y,ies,ier,iers,iest,ied,ying,yings,ily,inesses,iment,iments,iless,iful"),
- # (e.g., baseless, shoeless)
- (r"[%s].*(eless)" % vowels, "e,es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eness,enesses,eful"),
- # (e.g., gutless, hatless, spotless)
- (r"[%s][%s][bdgklmnprt]?(less)" % (cons, vowels), ",s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,ful"),
- # (e.g., thoughtless, worthless)
- (r"[%s].*?(less)" % vowels, ",s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,ful"),
- # (e.g., baseness, toeness)
- (r"[%s].*(eness)" % vowels, "e,es,er,ers,est,ed,ing,ings,eing,eings,ely,enesses,ement,ements,eless,eful"),
- # (e.g., bluntness, grayness)
- (r"[%s].*(ness)" % vowels, ",s,er,ers,est,ed,ing,ings,ly,nesses,ment,ments,less,ful"),
- # (e.g., albatross, kiss)
- (r"[%s]ss" % vowels, "es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., joyous, fractious, gaseous)
- (r"[%s].*(ous)" % vowels, "ly,ness"),
- # (e.g., tries, unties, jollies, beauties)
- (r"(ies)", "y,ie,yer,yers,ier,iers,iest,ied,ying,yings,yness,iness,ieness,ynesses,inesses,ienesses,iment,iement,iments,iements,yless,iless,ieless,yful,iful,ieful"),
- # (e.g., crisis, kinesis)
- (r"[%s].*(sis)" % vowels, "ses,sises,sisness,sisment,sisments,sisless,sisful"),
- # (e.g., bronchitis, bursitis)
- (r"[%s].*(is)" % vowels, "es,ness,ment,ments,less,ful"),
- (r"[%s].*[cs]h(es)" % vowels, ",e,er,ers,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"),
- # (e.g., tokenizes) // adds British variations
- (r"[%s].*[%s](izes)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations"),
- # (e.g., tokenises) // British variant // ~expertise
- (r"[%s].*[%s](ises)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations"),
- # (e.g., aches, arches)
- (r"[%s].*[jsxz](es)" % vowels, ",e,er,ers,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"),
- # (e.g., judges, abridges)
- (r"[%s].*dg(es)" % vowels, "e,er,ers,est,ed,ing,ings,ely,eness,enesses,ment,ments,ement,ements,eless,eful"),
- # (e.g., trees, races, likes, agrees) covers all other -es words
- (r"e(s)", ",*"),
- # (e.g., segments, bisegments, cosegments)
- (r"segment(s)", ",*"),
- # (e.g., pigments, depigments, repigments)
- (r"pigment(s)", ",*"),
- # (e.g., judgments, abridgments)
- (r"[%s].*dg(ments)" % vowels, "ment,*ments"),
- # (e.g., merriments, embodiments) -iment in turn will generate y and *y (redo y)
- (r"[%s].*[%s]iment(s)" % (vowels, cons), ",*"),
- # (e.g., atonements, entrapments)
- (r"[%s].*ment(s)" % vowels, ",*"),
- # (e.g., viewers, meters, traders, transfers)
- (r"[%s].*er(s)" % vowels, ",*"),
- # (e.g., unflags) polysyllables
- (r"[%s].*[%s][%s][bdglmnprt](s)" % (vowels, cons, vowels), ",*"),
- # (e.g., frogs) monosyllables
- (r"[%s][%s][bdglmnprt](s)" % (vowels, cons), ",*"),
- # (e.g., killings, muggings)
- (r"[%s].*ing(s)" % vowels, ",*"),
- # (e.g., hulls, tolls)
- (r"[%s].*ll(s)" % vowels, ",*"),
- # (e.g., boas, polkas, spas) don't generate latin endings
- (r"a(s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., beads, toads)
- (r"[%s].*[%s].*(s)" % (vowels, cons), ",*"),
- # (e.g., boas, zoos)
- (r"[%s].*[%s](s)" % (cons, vowels), ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., ss, sss, ssss) no vowel (vowel case is already handled above)
- (r"ss()", ""),
- # (e.g., cds, lcds, m-16s) no vowel (can be a plural noun, but not verb)
- (r"[%s].*[%s1234567890](s)" % (cons, cons), ""),
- # Words ending in E
- # (e.g., apple, so it doesn't include apply)
- (r"appl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"),
- # (e.g., supple, so it doesn't include supply)
- (r"suppl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"),
- # (e.g., able, abominable, fungible, table, enable, idle, subtle)
- (r"[%s].*[%s]l(e)" % (vowels, cons), "es,er,ers,est,ed,ing,ings,y,ely,eness,enesses,ement,ements,eless,eful"),
- # (e.g., bookie, magpie, vie)
- (r"(ie)", "ies,ier,iers,iest,ied,ying,yings,iely,ieness,ienesses,iement,iements,ieless,ieful"),
- # (e.g., dye, redye, redeye)
- (r"ye()", "s,r,rs,st,d,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., judge, abridge)
- (r"[%s].*dg(e)" % vowels, "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ment,ments,less,ful,ement,ements,eless,eful"),
- # (e.g., true, due, imbue)
- (r"u(e)", "es,er,ers,est,ed,ing,ings,eing,eings,ly,ely,eness,enesses,ment,ments,less,ful,ement,ements,eless,eful"),
- # (e.g., tokenize) // adds British variations
- (r"[%s].*[%s](ize)" % (vowels, cons), "izes,izer,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"),
- # (e.g., tokenise) // British variant // ~expertise
- (r"[%s].*[%s](ise)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ising,isings,isation,isations"),
- # (e.g., tree, agree, rage, horse, hoarse)
- (r"[%s].*[%s](e)" % (vowels, cons), "es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eless,eful"),
- # Words ending in -ED
- # (e.g., agreed, freed, decreed, treed)
- (r"ree(d)", "ds,der,ders,ded,ding,dings,dly,dness,dnesses,dment,dments,dless,dful,,*"),
- # (e.g., feed, seed, Xweed)
- (r"ee(d)", "ds,der,ders,ded,ding,dings,dly,dness,dnesses,dment,dments,dless,dful"),
- # (e.g., tried)
- (r"[%s](ied)" % cons, "y,ie,ies,ier,iers,iest,ying,yings,ily,yly,iness,yness,inesses,ynesses,iment,iments,iless,iful,yment,yments,yless,yful"),
- # (e.g., controlled, fulfilled, rebelled)
- (r"[%s].*[%s].*l(led)" % (vowels, cons), ",s,er,ers,est,ing,ings,ly,ness,nesses,ment,ments,less,ful,&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful"),
- # (e.g., pulled, filled, fulled)
- (r"[%s].*l(led)" % vowels, "&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful"),
- # (e.g., hissed, grossed)
- (r"[%s].*s(sed)" % vowels, "&,&es,&er,&ers,&est,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"),
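- # (e.g., hugged, trekked)
- # NOTE(review): unlike the parallel -er and -est rules, this pattern is not followed by "% (cons, vowels)", so "[%s]" stays a literal character class -- confirm and fix.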
- (r"[%s][%s](?P<ed1>[bdgklmnprt])((?P=ed1)ed)", ",s,&er,&ers,&est,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., tokenized) // adds British variations
- (r"[%s].*[%s](ized)" % (vowels, cons), "izes,izer,izers,ize,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"),
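- # (e.g., tokenised) // British variant // ~expertise
- # NOTE(review): the pattern below repeats "(ized)" from the previous rule; "(ised)" looks intended -- confirm.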
- (r"[%s].*[%s](ized)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ise,ising,isings,isation,isations"),
- # (e.g., spoiled, tooled, tracked, roasted, atoned, abridged)
- (r"[%s].*(ed)" % vowels, ",e,s,es,er,ers,est,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"),
- # (e.g., bed, sled) words with a single e as the only vowel
- (r"ed()", "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"),
- # Words ending in -ER
- # (e.g., altimeter, ammeter, odometer, perimeter)
- (r"meter()", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., agreer, beer, budgeteer, engineer, freer)
- (r"eer()", "eers,eered,eering,eerings,eerly,eerness,eernesses,eerment,eerments,eerless,eerful,ee,ees,eest,eed,eeing,eeings,eely,eeness,eenesses,eement,eements,eeless,eeful,eerer,eerers,eerest"),
- # (e.g., acidifier, saltier)
- (r"[%s].*[%s](ier)" % (vowels, cons), "y,ie,ies,iest,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,yment,yments,yless,yful,iment,iments,iless,iful,iers,iered,iering,ierings,ierly,ierness,iernesses,ierment,ierments,ierless,ierful,ierer,ierers,ierest"),
- # (e.g., puller, filler, fuller)
- (r"[%s].*l(ler)" % vowels, "&,&s,&est,&ed,&ing,&ings,ly,lely,&ness,&nesses,&ment,&ments,&ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful"),
- # (e.g., hisser, grosser)
- (r"[%s].*s(ser)" % vowels, "&,&es,&est,&ed,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erment,&erments,&erless,&erful"),
- # (e.g., bigger, trekker, hitter)
- (r"[%s][%s](?P<er1>[bdgkmnprt])((?P=er1)er)" % (cons, vowels), "s,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful"),
- # (e.g., tokenize) // adds British variations
- (r"[%s].*[%s](izer)" % (vowels, cons), "izes,ize,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"),
- # (e.g., tokenise) // British variant // ~expertise
- (r"[%s].*[%s](iser)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,ise,isers,ised,ising,isings,isation,isations"),
- #(e.g., actioner, atoner, icer, trader, accruer, churchgoer, prefer)
- (r"[%s].*(er)" % vowels, ",e,s,es,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ers,ered,erred,ering,erring,erings,errings,erly,erness,ernesses,erment,erments,erless,erful,erer,erers,erest,errer,errers,errest"),
- # Words ending in -EST
- # (e.g., sliest, happiest, wittiest)
- (r"[%s](iest)" % cons, "y,ies,ier,iers,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,iment,iments,iless,iful"),
- # (e.g., fullest)
- (r"[%s].*l(lest)" % vowels, "&,&s,&er,&ers,&ed,&ing,&ings,ly,&ness,&nesses,&ment,&ments,&ful"),
- # (e.g., grossest)
- (r"[%s].*s(sest)" % vowels, "&,&es,&er,&ers,&ed,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"),
- # (e.g., biggest)
- (r"[%s][%s](?P<est1>[bdglmnprst])((?P=est1)est)" % (cons, vowels), ",s,&er,&ers,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., basest, archest, rashest)
- (r"[%s].*([cs]h|[jsxz])(est)" % vowels, "e,es,er,ers,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"),
- # (e.g., severest, Xinterest, merest)
- (r"er(est)", "e,es,er,ers,ed,eing,eings,ely,eness,enesses,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"),
- # (e.g., slickest, coolest, ablest, amplest, protest, quest)
- (r"[%s].*(est)" % vowels, ",e,s,es,er,ers,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"),
- # (e.g., rest, test)
- (r"est", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # Words ending in -FUL
- # (e.g., beautiful, plentiful)
- (r"[%s].*[%s](iful)" % (vowels, cons), "ifully,ifulness,*y"),
- # (e.g., hopeful, sorrowful)
- (r"[%s].*(ful)" % vowels, "fully,fulness,,*"),
- # Words ending in -ICAL
- (r"[%s].*(ical)" % vowels, "ic,ics,ically"),
- # Words ending in -IC
- (r"[%s].*(ic)" % vowels, "ics,ical,ically"),
- # Words ending in -ING
- # (e.g., dying, crying, supplying)
- (r"[%s](ying)" % cons, "yings,ie,y,ies,ier,iers,iest,ied,iely,yly,ieness,yness,ienesses,ynesses,iment,iments,iless,iful"),
- # (e.g., pulling, filling, fulling)
- (r"[%s].*l(ling)" % vowels, ",*,&,&s,&er,&ers,&est,&ed,&ings,&ness,&nesses,&ment,&ments,&ful"),
- # (e.g., hissing, grossing, processing)
- (r"[%s].*s(sing)" % vowels, "&,&s,&er,&ers,&est,&ed,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"),
- # (e.g., hugging, trekking)
- (r"[%s][%s](?P<ing1>[bdgklmnprt])((?P=ing1)ing)" % (cons, vowels), ",s,&er,&ers,&est,&ed,&ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., freeing, agreeing)
- (r"eeing()", "ee,ees,eer,eers,eest,eed,eeings,eely,eeness,eenesses,eement,eements,eeless,eeful"),
- # (e.g., ageing, aweing)
- (r"[%s].*(eing)" % vowels, "e,es,er,ers,est,ed,eings,ely,eness,enesses,ement,ements,eless,eful"),
- # (e.g., toying, playing)
- (r"[%s].*y(ing)" % vowels, ",s,er,ers,est,ed,ings,ly,ingly,ness,nesses,ment,ments,less,ful"),
- # (e.g., editing, crediting, expediting, siting, exciting)
- (r"[%s].*[%s][eio]t(ing)" % (vowels, cons), ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., robing, siding, doling, translating, flaking)
- (r"[%s][%s][bdgklmt](ing)" % (cons, vowels), "*e,ings,inger,ingers,ingest,inged,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., tokenize) // adds British variations
- (r"[%s].*[%s](izing)" % (vowels, cons), "izes,izer,izers,ized,ize,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"),
- # (e.g., tokenise) // British variant // ~expertise
- (r"[%s].*[%s](ising)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ise,isings,isation,isations"),
- # (e.g., icing, aging, achieving, amazing, housing)
- (r"[%s][cgsvz](ing)" % vowels, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., dancing, troubling, arguing, bluing, carving)
- (r"[%s][clsuv](ing)" % cons, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., charging, bulging)
- (r"[%s].*[lr]g(ing)" % vowels, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., farming, harping, interesting, bedspring, redwing)
- (r"[%s].*[%s][bdfjkmnpqrtwxz](ing)" % (vowels, cons), ",*,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., spoiling, reviling, autoing, egging, hanging, hingeing)
- (r"[%s].*(ing)" % vowels, ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # (e.g., wing, thing) monosyllables
- (r"(ing)", "ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"),
- # -LEAF rules omitted
- # Words ending in -MAN
- # (e.g., policewomen, hatchetmen, dolmen)
- (r"(man)", "man,mens,mener,meners,menest,mened,mening,menings,menly,menness,mennesses,menless,menful"),
- # Words ending in -MENT
- # (e.g., segment, bisegment, cosegment, pigment, depigment, repigment)
- (r"segment|pigment", "s,ed,ing,ings,er,ers,ly,ness,nesses,less,ful"),
- # (e.g., judgment, abridgment)
- (r"[%s].*dg(ment)" % vowels, "*e"),
- # (e.g., merriment, embodiment)
- (r"[%s].*[%s](iment)" % (vowels, cons), "*y"),
- # (e.g., atonement, entrapment)
- (r"[%s].*[%s](ment)" % (vowels, cons), ",*"),
- # Words ending in -O
- # (e.g., taboo, rodeo)
- (r"[%s]o()" % vowels, "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., tomato, bonito)
- (r"[%s].*o()" % vowels, "s,es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # Words ending in -UM
- # (e.g., datum, quantum, tedium, strum, [oil]drum, vacuum)
- (r"[%s].*(um)" % vowels, "a,ums,umer,ummer,umers,ummers,umed,ummed,uming,umming,umings,ummings,umness,umments,umless,umful"),
- # Words ending in -Y
- # (e.g., ably, horribly, wobbly)
- (r"[%s].*b(ly)" % vowels, "le,les,ler,lers,lest,led,ling,lings,leness,lenesses,lement,lements,leless,leful"),
- # (e.g., happily, dizzily)
- (r"[%s].*[%s](ily)" % (vowels, cons), "y,ies,ier,iers,iest,ied,ying,yings,yness,iness,ynesses,inesses,iment,iments,iless,iful"),
- # (e.g., peaceful+ly)
- (r"[%s].*ful(ly)" % vowels, ",*"),
- # (e.g., fully, folly, coolly, fatally, dally)
- (r"[%s].*l(ly)" % vowels, ",*,lies,lier,liers,liest,lied,lying,lyings,liness,linesses,liment,liments,liless,liful,*l"),
- # (e.g., monopoly, Xcephaly, holy)
- (r"[%s](ly)" % vowels, "lies,lier,liers,liest,lied,lying,lyings,liness,linesses,liment,liments,liless,liful"),
- # (e.g., frequently, comely, deeply, apply, badly)
- (r"[%s].*(ly)" % vowels, ",*,lies,lier,liers,liest,lied,lying,lyings,liness,linesses,lyless,lyful"),
- # (e.g., happy, ply, spy, cry)
- (r"[%s](y)" % cons, "ies,ier,iers,iest,ied,ying,yings,ily,yness,iness,ynesses,inesses,iment,iments,iless,iful,yment,yments,yless,yful"),
- # (e.g., betray, gay, stay)
- (r"[%s]y()" % vowels, "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # Root rules
- # (e.g., fix, arch, rash)
- (r"[%s].*(ch|sh|[jxz])()" % vowels, "es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., unflag, open, besot)
- (r"[%s].*[%s][%s][bdglmnprt]()" % (vowels, cons, vowels), "s,er,ers,est,ed,ing,ings,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., bed, cop)
- (r"[%s][%s][bdglmnprt]()" % (cons, vowels), "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"),
- # (e.g., schemata, automata)
- (r"[%s].*[%s][%s]ma(ta)" % (vowels, cons, vowels), ",s,tas,tum,tums,ton,tons,tic,tical"),
- # (e.g., chordata, data, errata, sonata, toccata)
- (r"[%s].*t(a)" % vowels, "as,ae,um,ums,on,ons,ic,ical"),
- # (e.g., polka, spa, schema, ova, polyhedra)
- (r"[%s].*[%s](a)" % (vowels, cons), "as,aed,aing,ae,ata,um,ums,on,ons,al,atic,atical"),
- # (e.g., full)
- (r"[%s].*ll()" % vowels, "s,er,ers,est,ed,ing,ings,y,ness,nesses,ment,ments,-less,ful"),
- # (e.g., spoon, rhythm)
- (r"[%s].*()", "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"),
- )
# There are a limited number of named groups available in a single
# regular expression, so we'll partition the list of rules into
# smaller chunks. Each chunk is compiled into one alternation in which
# every rule is wrapped in a named group "_g<index within the chunk>" and
# anchored to the end of the word.
_partition_size = 20
_partitions = []
# Step by the partition size so that no trailing empty chunk is produced when
# len(rules) is an exact multiple of _partition_size: an empty pattern would
# match every word without setting any "_g" group.
for start in xrange(0, len(rules), _partition_size):
    end = start + _partition_size
    pattern = "|".join("(?P<_g%s>%s)$" % (i, r[0])
                       for i, r in enumerate(rules[start:end]))
    _partitions.append(re.compile(pattern))
def variations(word):
    """Given an English word, returns a collection of morphological variations
    on the word by algorithmically adding and removing suffixes. The variation
    list may contain non-words (e.g. render -> renderment).

    >>> variations("pull")
    set(['pull', 'pullings', 'pullnesses', 'pullful', 'pullment', 'puller', ... ])

    Each comma-separated entry in a rule's result string is applied to the
    root (the word with the matched ending removed) as follows:

    * ``&xyz`` -- double the last letter of the root, then append ``xyz``.
    * ``*xyz`` -- append ``xyz`` to the root and recursively add all the
      variations of the resulting word.
    * ``xyz`` -- append ``xyz`` to the root.

    :param word: the word to expand.
    :returns: if the word is in the exceptions table, the list of words from
        its entry; if a rule matches, a set containing the word and the
        generated forms; otherwise the single-item list ``[word]``.
    """

    if word in _exdict:
        return _exdict[word].split(" ")

    for i, p in enumerate(_partitions):
        match = p.search(word)
        if not match:
            continue

        # Get the named group that matched; its name is "_g<n>" where n is
        # the index of the rule inside this partition
        num = int([k for k, v in iteritems(match.groupdict())
                   if v is not None and k.startswith("_g")][0][2:])
        # Get the positional groups for the matched rule (all other
        # positional groups are None). The last one is the ending to strip;
        # an empty ending means the suffixes are added to the whole word.
        groups = [g for g in match.groups() if g is not None]
        ending = groups[-1]
        root = word[:-len(ending)] if ending else word

        out = set((word,))
        results = rules[i * _partition_size + num][1]
        for result in results.split(","):
            if result.startswith("&"):
                out.add(root + root[-1] + result[1:])
            elif result.startswith("*"):
                # update() merges in place; union() would build a new set
                # and throw the recursive variations away
                out.update(variations(root + result[1:]))
            else:
                out.add(root + result)
        return out

    return [word]
if __name__ == '__main__':
    # Ad-hoc timing check: expand one word, then print the elapsed time and
    # the number of variations produced.
    # time.clock() was removed in Python 3.8; timeit.default_timer is
    # available on both Python 2 and Python 3.
    from timeit import default_timer

    t = default_timer()
    s = variations("rendering")
    print(default_timer() - t)
    print(len(s))