# download the latest database dump from: http://dumps.wikimedia.org/backup-index.html
# look for "enwikiquote"
# then, on the download page for wikiquote, look for an .xml.bz2 file that's 60 - 100 MB
# 'multistream' or not doesn't seem to make any difference; I used
# enwikiquote-20130817-pages-articles-multistream.xml.bz2

# then perform the following replacements on it (notated as vim commands, but I recommend
# that you use sed or some other stream editor; rough sed equivalents are sketched at the
# end of these notes)
g/^=/d

# note: ^M is Ctrl+M (a.k.a. the return key).
# to input this in vim, type Ctrl+V, then Ctrl+M (or Enter). copying & pasting won't work!
%s/<br \/>/^M/g
%s/<br>/^M/g
%s/<p>/^M/g
%s/<\/p>/^M/g

%s/<!--[^-]\+-->//g
g/<.*>/d
%s/&[a-z]\{2,5\};//g
g/^{{[^}]*}}$/d

# there may be some quotes lost/buried inside metadata tags; recover them if you want:
grep "^|[a-z]\+=" compacted_quotes2 > out
# delete these lines from the main file
g/^|[a-z]\+=/d
# now look in 'out' and salvage any lines that are actually quotes

# strip wiki-link targets; run this several times, since there are some constructs
# like [[left| right| blah]]
%s/\[\[[^]|]\+|/[[/g

%s/\*\* Variant: /** /
%s/\*\* Variant translation: /** /
%s/\*\* Translation: /** /
%s/\*\* English equivalent: /** /
%s/\*\* Meaning: /** /

# sort all lines by length:
cat filename | awk '{print length, $0;}' | sort -nr > out
# careful: don't use -u at the same time as -n; you'll nuke most of the file if you do
sort -u out > filename
rm out
# manually decide what looks "too short" and remove those lines

# remove the line-length prefixes added by the awk step:
%s/^[0-9]\+ //

# lowercase the entire file:
cat filename | tr '[:upper:]' '[:lower:]' > out

# remove all punctuation (note: [^a-z] also strips digits and spaces, keeping only a-z):
%s/[^a-z]//g
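
# ----------------------------------------------------------------------------
# appendix: a rough, untested sed sketch of the markup-stripping steps above
# (GNU sed assumed, since \+ and newlines in the replacement are GNU extensions;
# 'dump.xml' and 'quotes.txt' are placeholder file names, not part of the notes
# above). the <br>/<p> splits get their own pass so that the later line-based
# deletions see the newly split lines. grep any "|key=" metadata lines you want
# to salvage into a side file before running the second pass, as described above.

sed -e '/^=/d' \
    -e 's/<br \/>/\n/g' \
    -e 's/<br>/\n/g' \
    -e 's/<p>/\n/g' \
    -e 's/<\/p>/\n/g' \
    dump.xml \
  | sed -e 's/<!--[^-]\+-->//g' \
        -e '/<.*>/d' \
        -e 's/&[a-z]\{2,5\};//g' \
        -e '/^{{[^}]*}}$/d' \
        -e '/^|[a-z]\+=/d' \
  > quotes.txt

# the wiki-link targets and "** Variant:"-style prefixes can be handled the same
# way; repeat the [[...| pass a few times for the nested constructs:
sed -i -e 's/\[\[[^]|]\+|/[[/g' \
       -e 's/\*\* Variant: /** /' \
       -e 's/\*\* Variant translation: /** /' \
       -e 's/\*\* Translation: /** /' \
       -e 's/\*\* English equivalent: /** /' \
       -e 's/\*\* Meaning: /** /' \
       quotes.txt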
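
# ----------------------------------------------------------------------------
# likewise, a hedged sketch of the length-sort / dedupe / lowercase tail end as
# plain shell (same caveats; 'quotes.txt' and 'corpus.txt' are placeholders, and
# the manual "too short" trimming is not automated here):

awk '{ print length, $0 }' quotes.txt | sort -nr > by_length.txt  # longest lines first
sort -u by_length.txt > quotes.txt                                # dedupe with a plain sort; no -n!
rm by_length.txt
# ...manually delete the lines that look too short, then strip the length
# prefixes, lowercase everything, and drop every character that isn't a-z
# (tr -cd keeps only letters and newlines, so spaces go too):
sed -i 's/^[0-9]\+ //' quotes.txt
tr '[:upper:]' '[:lower:]' < quotes.txt | tr -cd 'a-z\n' > corpus.txt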