From 1118c77c2f0894120d373db4ab318701e944b4f5 Mon Sep 17 00:00:00 2001 From: Thang Trinh Date: Mon, 18 Apr 2016 17:01:07 +0700 Subject: [PATCH] fix incorrect vocabs when loading from utf-8 binary --- .../java/com/medallia/word2vec/Word2VecModel.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/medallia/word2vec/Word2VecModel.java b/src/main/java/com/medallia/word2vec/Word2VecModel.java index 5fa6b25..675f6e9 100644 --- a/src/main/java/com/medallia/word2vec/Word2VecModel.java +++ b/src/main/java/com/medallia/word2vec/Word2VecModel.java @@ -161,19 +161,24 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder, Profilin long lastLogMessage = System.currentTimeMillis(); final float[] floats = new float[layerSize]; + + ByteBuffer bb = ByteBuffer.allocate(1000); + byte b; for (int lineno = 0; lineno < vocabSize; lineno++) { // read vocab - sb.setLength(0); - c = (char) buffer.get(); + bb.rewind(); + b = buffer.get(); + c = (char)b; while (c != ' ') { // ignore newlines in front of words (some binary files have newline, // some don't) if (c != '\n') { - sb.append(c); + bb.put(b); } - c = (char) buffer.get(); + b = buffer.get(); + c = (char)b; } - vocabs.add(sb.toString()); + vocabs.add(new String(bb.array(), 0, bb.position(), Charset.forName("utf-8"))); // read vector final FloatBuffer floatBuffer = buffer.asFloatBuffer();