# Python: File Stemming dengan Sastrawi
# From OnnoCenterWiki
import sys, getopt
import argparse
import os,nltk,os.path,re,string
import argparse
import Sastrawi
from nltk.stem.porter import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
def parse_args():
    """Parse the command-line options of the stemming script.

    Options:
        -i / --infile:  name of the text file to read.
        -o / --outfile: name of the file the result is written to.

    Returns:
        argparse.Namespace: has the string attributes ``infile`` and
        ``outfile``; each is the empty string when its option is omitted.
    """
    parser = argparse.ArgumentParser()
    # The listing had ``default=,`` here, which does not parse.  An empty
    # string is the presumed original value (likely lost to wiki markup).
    parser.add_argument('-i', '--infile', default='', help='input filename')
    parser.add_argument('-o', '--outfile', default='', help='output filename')
    return parser.parse_args()
def hanya_huruf(input):
    """Return True when *input* consists solely of ASCII letters.

    ("hanya huruf" is Indonesian for "letters only".)

    Args:
        input: The string to check.  The name shadows the builtin but is
            kept so existing keyword callers keep working.

    Returns:
        bool: True when the string is one or more characters from a-z or
        A-Z and nothing else; False otherwise, including for the empty
        string.
    """
    # fullmatch instead of match() with '^...$': '$' also matches just
    # before a trailing newline, which would let "abc\n" through.
    return re.fullmatch(r'[a-zA-Z]+', input) is not None
def main():
    """Stem the words of the input file and write the kept stems to the output file.

    Steps:
        1. Read the file named by ``--infile`` and split it on whitespace.
        2. Keep only tokens that pass ``hanya_huruf``, are 2 to 19
           characters long, and are not the literal ``'Iing'`` (the
           source gives no reason for that exclusion).
        3. Lowercase each kept token and stem it with the Sastrawi stemmer.
        4. Drop stems found in NLTK's English or Indonesian stopword lists.
        5. Write each remaining stem followed by a single space to the
           file named by ``--outfile``.

    Raises:
        OSError: if the input file cannot be read or the output file
            cannot be written.
        LookupError: raised by NLTK when the stopwords corpus is not
            installed.
    """
    args = parse_args()

    # Context managers close the files even if stemming or the stopword
    # lookup raises part-way through.
    with open(args.infile, "r") as source:
        tokens = source.read().split()

    stemmer = StemmerFactory().create_stemmer()

    # Load both stopword lists once and keep them in a set.  Calling
    # stopwords.words() per token re-reads the list and scans it linearly.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(nltk.corpus.stopwords.words('indonesian'))

    with open(args.outfile, "w") as sink:
        for token in tokens:
            if not (hanya_huruf(token) and 1 < len(token) < 20
                    and token != 'Iing'):
                continue
            # strip() cannot remove anything from a token that passed
            # hanya_huruf; it is kept only to mirror the original pipeline.
            stem = stemmer.stem(token.strip(string.punctuation).lower())
            if stem not in stopwords:
                sink.write(stem)
                sink.write(" ")
# Run the pipeline as soon as this file is loaded.  There is no
# `if __name__ == "__main__"` guard, so importing the module triggers it too.
main()