# download the latest database dump from: http://dumps.wikimedia.org/backup-index.html
# look for "enwikiquote"
# then, on the download page for wikiquote, look for an .xml.bz2 file that's 60 - 100 MB
# 'multistream' or not doesn't seem to make any difference; I used
# enwikiquote-20130817-pages-articles-multistream.xml.bz2

# then perform the following replacements on it (notated as vim commands, but I recommend
# that you use sed or some other stream editor; rough sed equivalents are sketched at the
# end of these notes)
g/^=/d

# note: ^M is Ctrl+M (a.k.a. the return key).
# to input this in vim, type Ctrl+V, then Ctrl+M (or Enter). copying & pasting won't work!
%s/<br \/>/^M/g
%s/<br>/^M/g
%s/<p>/^M/g
%s/<\/p>/^M/g

%s/<!--[^-]\+-->//g
g/<.*>/d
%s/&[a-z]\{2,5\};//g
g/^{{[^}]*}}$/d

# there may be some quotes lost/buried inside metadata tags; recover them if you want:
grep "^|[a-z]\+=" compacted_quotes2 > out
# delete these lines from the main file
g/^|[a-z]\+=/d
# now look in 'out' and salvage any lines that are actually quotes

# strip wiki-link targets; run this several times, since there are some constructs
# like [[left| right| blah]]
%s/\[\[[^]|]\+|/[[/g

%s/\*\* Variant: /** /
%s/\*\* Variant translation: /** /
%s/\*\* Translation: /** /
%s/\*\* English equivalent: /** /
%s/\*\* Meaning: /** /

# sort all lines by length:
cat filename | awk '{print length, $0;}' | sort -nr > out
# careful: don't use -u at the same time as -n; you'll nuke most of the file if you do
sort -u out > filename
rm out
# manually decide what looks "too short" and remove those lines

# remove the line-length prefixes added by the awk step:
%s/^[0-9]\+ //

# lowercase the entire file:
cat filename | tr '[:upper:]' '[:lower:]' > out

# remove all punctuation (note: [^a-z] also strips digits and spaces, keeping only a-z):
%s/[^a-z]//g
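
# ----------------------------------------------------------------------------
# appendix: a rough, untested sed sketch of the markup-stripping steps above
# (GNU sed assumed, since \+ and newlines in the replacement are GNU extensions;
# 'dump.xml' and 'quotes.txt' are placeholder file names, not part of the notes
# above). the <br>/<p> splits get their own pass so that the later line-based
# deletions see the newly split lines. grep any "|key=" metadata lines you want
# to salvage into a side file before running the second pass, as described above.

sed -e '/^=/d' \
    -e 's/<br \/>/\n/g' \
    -e 's/<br>/\n/g' \
    -e 's/<p>/\n/g' \
    -e 's/<\/p>/\n/g' \
    dump.xml \
  | sed -e 's/<!--[^-]\+-->//g' \
        -e '/<.*>/d' \
        -e 's/&[a-z]\{2,5\};//g' \
        -e '/^{{[^}]*}}$/d' \
        -e '/^|[a-z]\+=/d' \
  > quotes.txt

# the wiki-link targets and "** Variant:"-style prefixes can be handled the same
# way; repeat the [[...| pass a few times for the nested constructs:
sed -i -e 's/\[\[[^]|]\+|/[[/g' \
       -e 's/\*\* Variant: /** /' \
       -e 's/\*\* Variant translation: /** /' \
       -e 's/\*\* Translation: /** /' \
       -e 's/\*\* English equivalent: /** /' \
       -e 's/\*\* Meaning: /** /' \
       quotes.txt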
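
# ----------------------------------------------------------------------------
# likewise, a hedged sketch of the length-sort / dedupe / lowercase tail end as
# plain shell (same caveats; 'quotes.txt' and 'corpus.txt' are placeholders, and
# the manual "too short" trimming is not automated here):

awk '{ print length, $0 }' quotes.txt | sort -nr > by_length.txt  # longest lines first
sort -u by_length.txt > quotes.txt                                # dedupe with a plain sort; no -n!
rm by_length.txt
# ...manually delete the lines that look too short, then strip the length
# prefixes, lowercase everything, and drop every character that isn't a-z
# (tr -cd keeps only letters and newlines, so spaces go too):
sed -i 's/^[0-9]\+ //' quotes.txt
tr '[:upper:]' '[:lower:]' < quotes.txt | tr -cd 'a-z\n' > corpus.txt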