Python: NLTK stopwords: Difference between revisions

From OnnoCenterWiki
Jump to navigation | Jump to search
Onnowpurbo (talk | contribs)
Created page with "Directory ~/nltk_data/corpora/stopwords"
 
Onnowpurbo (talk | contribs)
No edit summary
Line 2: Line 2:


  ~/nltk_data/corpora/stopwords
  ~/nltk_data/corpora/stopwords
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance; the word loop below calls ps.stem() on every token.
ps=PorterStemmer()
def parse_args():
    """Parse the script's command-line options.

    Recognizes ``-i`` / ``--infile`` (the input filename), which falls back
    to an empty string when the option is omitted.

    Returns:
        The ``argparse.Namespace`` built from the process arguments; the
        filename is available as its ``infile`` attribute.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-i',
        '--infile',
        help='input filename',
        default='',
    )
    options = cli.parse_args()
    return options
# Read the file named by -i/--infile and print every distinct stemmed token
# that survives the English and Indonesian stopword filters.
args = parse_args()
infile = args.infile

# The context manager closes the file even if read() raises.
with open(infile, 'r') as handle:
    fcontent = handle.read()
fs = fcontent.split()

# Load both stopword corpora once; calling stopwords.words() inside the loop
# repeated that work for every token. A set gives constant-time membership.
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(nltk.corpus.stopwords.words('indonesia'))

wordlist = []   # distinct accepted words, in first-seen order
seen = set()    # mirrors wordlist for fast duplicate checks

for word in fs:
    # Normalize: trim surrounding punctuation, lowercase, then stem.
    # NOTE(review): the stopword lookup happens on the *stemmed* form, so a
    # stopword whose stem differs from its listed spelling would not be
    # filtered - confirm this ordering is intended.
    word = ps.stem(word.strip(string.punctuation).lower())
    # Skip over-long tokens (15+ characters), stopwords, and repeats.
    if len(word) >= 15 or word in stop_words or word in seen:
        continue
    seen.add(word)
    wordlist.append(word)
    print(word)
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04

Revision as of 23:52, 30 January 2017

Directory

~/nltk_data/corpora/stopwords


Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik

import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer

# Module-level Porter stemmer instance; the word loop below calls ps.stem() on every token.
ps=PorterStemmer()

def parse_args():
    """Parse the script's command-line options.

    Recognizes ``-i`` / ``--infile`` (the input filename). The default is an
    empty string; the literal was missing here (``default=,``), which made
    the whole script a syntax error.

    Returns:
        The ``argparse.Namespace`` built from the process arguments; the
        filename is available as its ``infile`` attribute.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()

# Read the file named by -i/--infile and print every distinct stemmed token
# that survives the English and Indonesian stopword filters.
args = parse_args()
infile = args.infile

# The context manager closes the file even if read() raises.
with open(infile, 'r') as handle:
    fcontent = handle.read()
fs = fcontent.split()

# Load both stopword corpora once; calling stopwords.words() inside the loop
# repeated that work for every token. A set gives constant-time membership.
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(nltk.corpus.stopwords.words('indonesia'))

wordlist = []   # distinct accepted words, in first-seen order
seen = set()    # mirrors wordlist for fast duplicate checks

for word in fs:
    # Normalize: trim surrounding punctuation, lowercase, then stem.
    # NOTE(review): the stopword lookup happens on the *stemmed* form, so a
    # stopword whose stem differs from its listed spelling would not be
    # filtered - confirm this ordering is intended.
    word = ps.stem(word.strip(string.punctuation).lower())
    # Skip over-long tokens (15+ characters), stopwords, and repeats.
    if len(word) >= 15 or word in stop_words or word in seen:
        continue
    seen.add(word)
    wordlist.append(word)
    print(word)


Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file

~/nltk_data/corpora/stopwords/indonesia

contoh

saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04