# Wds.pm   Daniel Brockman 070223   Wordcounting
# Creative Commons Attribution License.
# Permission granted to copy, use or modify, with attribution to author.
#
# Usage: use Wds.pm
#
# Provides three subs (no package statement, so they are compiled into
# the package that is current when this file is loaded):
#   wfil - append the lines of a text file (or open handle) to an array
#   wfee - count word occurrences in an array of lines
#   wftl - invert a { word => count } hash into { count => "words" }
#
use strict;
use warnings;

#__________________________________________________________
sub wfil {
#
# Usage: $rlines = wfil($rf, $rlines)
#
# $rf is either a file name or an already-open file handle
# (glob, glob reference or handle object).
#
#   * If $rf is a reference or a glob it is read as a file handle.
#     The handle must already be open; wfil does not close it.
#   * Otherwise $rf is taken literally as a file name: wfil opens it
#     read-only, reads it and closes it.  The three-argument form of
#     open is used, so characters such as '>' or '|' in the name are
#     never interpreted as an open mode or a command.
#   * If the name cannot be opened, $rf is passed to readline as-is
#     (legacy fallback for a handle given by name).
#   * If $rf is undefined nothing is read.
#
# $rlines is a reference to an array; the lines read are appended to
# it.  A new array is created when $rlines is undefined.
#
# Returns the array reference that received the lines.
#
# wfrq.pl Daniel Brockman 20030828 Word Frequency Counter
# modified 070223
    my (
        $rf,        # text input file name or handle
        $rlines,    # ref to array of lines from file
    ) = @_;

    $rlines = [] unless defined $rlines;
    return $rlines unless defined $rf;

    my $in;             # handle actually read from
    my $opened = 0;     # true only if wfil opened the handle itself

    if (ref($rf) || ref(\$rf) eq 'GLOB') {
        $in = $rf;                      # caller-supplied handle
    }
    elsif (open(my $fh, '<', $rf)) {
        $in     = $fh;                  # file name, opened read-only
        $opened = 1;
    }
    else {
        $in = $rf;                      # legacy: try it as a handle
    }

    push @{$rlines}, <$in>;             # slurp and append every line
    close($in) if $opened;              # never close the caller's handle

    return $rlines;
} # end sub wfil
#__________________________________________________________

#__________________________________________________________
sub wftl {
#
# Usage: $rfreq = wftl($rfreq, $rword)
#
# $rword is a reference to a hash { word => count, ... }.
# One way to construct it is sub wfee.  wftl does not change it.
#
# $rfreq is a reference to a hash { count => "word word ...", ... },
# to which wftl appends: every word of %$rword is added to the
# space-separated list stored under its count.  A new hash is created
# when $rfreq is undefined.  Words are processed in sorted order so
# the lists are reproducible from run to run.  Words whose count is
# undefined are skipped.
#
# Hash element order is not guaranteed.  One way to list the counts
# from most to least frequent is
#   @sortord = sort { $b <=> $a } keys %{$rfreq}
#
# Returns $rfreq.
#
# wfrq.pl Daniel Brockman 20030828 Word Frequency Counter
# modified 070223
    my (
        $rfreq,     # hash ref, { count => "word word ...", ... }
        $rword,     # hash ref, { word => count, ... }
    ) = @_;

    $rfreq = {} unless defined $rfreq;
    return $rfreq unless defined $rword;

    for my $word (sort keys %{$rword}) {
        my $count = $rword->{$word};
        next unless defined $count;         # nothing to rank under

        if (defined $rfreq->{$count} && $rfreq->{$count} ne '') {
            $rfreq->{$count} .= " " . $word;    # append to list
        }
        else {
            $rfreq->{$count} = $word;           # start list
        }
    }

    return $rfreq;
} # end sub wftl
#__________________________________________________________

#__________________________________________________________
sub wfee {
#
# Usage: $rword = wfee($rlines, $rword)
#
# $rlines is a reference to an array of text lines, for example the
# array filled by sub wfil.  wfee does not change @$rlines; each line
# is copied before it is lowercased.  Undefined elements are skipped.
#
# $rword is a reference to a hash { word => count, ... } to which
# wfee adds.  A new hash is created when $rword is undefined.
# Hash element order is not guaranteed.  One way to list the words
# from most to least frequent is
#   @sortord = sort { ${$rword}{$b} <=> ${$rword}{$a} } keys %{$rword}
#
# wfee lowercases A-Z and separates words on any character other
# than a digit or an ASCII letter, ignoring all punctuation.  Empty
# fragments (produced when a line starts with a separator) are not
# counted.  wfee ignores very frequently occurring words including
# "a", "and" and "the" and some others.
#
# wfee considers two words the same word if their first $nl letters
# are the same.  Only the first $nl characters are kept as hash keys.
# $nl is currently set to 6.
#
# Returns $rword.
#
# wfrq.pl Daniel Brockman 20030828 Word Frequency Counter
# modified 070223
    my (
        $rlines,    # ref to array of lines from file
        $rword,     # hash ref, { word => count, ... }
    ) = @_;

    $rword = {} unless defined $rword;
    return $rword unless defined $rlines;

    my $nl = 6;     # number of letters in a "word"

    # common words that are never counted
    my %is_common = map { $_ => 1 } (
        "a","and","the","of","to","in","with","for","or","is",
        "this","as","will","it","be","are","an","s"
    );

    for my $line (@{$rlines}) {
        next unless defined $line;              # skip unassigned

        # Lowercase a copy: the loop variable aliases the caller's
        # array element, so changing $line would change @$rlines.
        (my $text = $line) =~ tr/A-Z/a-z/;

        for my $token (split(/[^0-9a-z]+/, $text)) {
            next if $token eq '';               # leading empty field

            my $word = substr($token, 0, $nl);  # keep first $nl chars
            next if $is_common{$word};          # skip if common

            $rword->{$word}++;                  # count occurrences
        }
    }

    return $rword;
} # end sub wfee
#__________________________________________________________

1;