Possible reason that may fail 'Propagate' - 'write data locally' in LocalWriteSuite test · Issue #164 · tensorflow/ecosystem

ecosystem/spark/spark-tensorflow-connector/src/main/scala/org/tensorflow/spark/datasources/tfrecords/DefaultSource.scala

Lines 173 to 215 in 791a42f

    
           /**
            * Writes one partition as a TFRecord file named `part-NNNNN` (zero-padded partition
            * index) inside `localPath`, using plain java.io file access.
            *
            * NOTE(review): the existence check below is evaluated per partition, against the
            * directory. When several partitions are written into the same directory on the same
            * machine, a partition that starts after another one has already created the directory
            * sees it as pre-existing (throws under ErrorIfExists, writes nothing under Ignore).
            * Checking once before the partitions are mapped would avoid that; such a check has to
            * live in the caller, which is outside this method.
            *
            * @param index     partition index, used to build the part file name
            * @param part      records of this partition; only the keys are written
            * @param localPath directory that receives the part file
            * @param mode      save mode deciding what happens when `localPath` already exists
            * @return a single-element iterator holding the number of records written, or an
            *         empty iterator when the write is skipped under SaveMode.Ignore
            * @throws IllegalStateException if `localPath` exists and mode is ErrorIfExists
            */
           private def writePartitionLocal(
               index: Int,
               part: Iterator[(BytesWritable, NullWritable)],
               localPath: String,
               mode: SaveMode): Iterator[Int] = {
             val dir = new File(localPath)
             val alreadyExists = dir.exists()
             if (alreadyExists && mode == SaveMode.ErrorIfExists) {
               throw new IllegalStateException(
                 s"LocalPath $localPath already exists. SaveMode: ErrorIfExists.")
             }
             if (alreadyExists && mode == SaveMode.Ignore) {
               Iterator.empty
             } else {
               // Make the directory if it does not exist
               dir.mkdirs()
               // The path to the partition file.
               val filePath = f"$localPath/part-$index%05d"
               // Buffer the stream: the records are written as many small chunks, and the
               // buffer is flushed when the stream is closed.
               val fos = new DataOutputStream(
                 new java.io.BufferedOutputStream(new FileOutputStream(filePath)))
               var count = 0
               try {
                 val tfw = new TFRecordWriter(fos)
                 for ((bw, _) <- part) {
                   // getBytes exposes the backing array, of which only the first getLength
                   // bytes are valid; trim it when the array is larger than the payload.
                   val backing = bw.getBytes
                   val length = bw.getLength
                   val record =
                     if (backing.length == length) backing
                     else java.util.Arrays.copyOf(backing, length)
                   tfw.write(record)
                   count += 1
                 }
               } finally {
                 fos.close()
               }
               Iterator(count)
             }
           }
        
           // Working around the closure variable captures. 
        
           /**
            * Returns a function of (partition index, partition records) that forwards to
            * writePartitionLocal with `localPath` and `mode` fixed to the values given here.
            */
           private def writePartitionLocalFun(
               localPath: String,
               mode: SaveMode): (Int, Iterator[(BytesWritable, NullWritable)]) => Iterator[Int] = {
             (partitionIndex: Int, records: Iterator[(BytesWritable, NullWritable)]) =>
               writePartitionLocal(partitionIndex, records, localPath, mode)
           }

The check `if (dir.exists())` in line 179 above may cause subsequent partition writes to fail if there are more than 2 partitions in the test below. Because the partitions are written inside a map (line 211 above) and all go to the same directory, any partition write that runs after the directory has been created would fail when it reaches the `if (dir.exists())` check.

ecosystem/spark/spark-tensorflow-connector/src/test/scala/org/tensorflow/spark/datasources/tfrecords/LocalWriteSuite.scala

Lines 42 to 70 in 791a42f

    
           "Propagate" should {
             "write data locally" in {
               // Two slices, so the dataframe has 2 partitions.
               val twoSliceRdd = spark.sparkContext.parallelize(testRows, numSlices = 2)
               val sourceDf = spark.createDataFrame(twoSliceRdd, schema)

               // The partitions are written to the local hard drive. Everything runs on one
               // machine here, so both partitions end up in the same directory; in a
               // distributed setting two different machines would each hold one partition.
               val tempRoot = Files.createTempDirectory("spark-connector-propagate")
               val localPath = tempRoot.toAbsolutePath.toString
               val savePath = localPath + "/testResult"
               val localWriter = sourceDf.write
                 .format("tfrecords")
                 .option("recordType", "Example")
                 .option("writeLocality", "local")
               localWriter.save(savePath)

               // Read the directory back with the Hadoop file readers; the data should match.
               // This only holds in this test, not in general: the partitions are written on
               // the workers, and for tests everything runs locally.
               val reloaded = spark.read
                 .format("tfrecords")
                 .option("recordType", "Example")
                 .load(savePath)
               // Sort by id and restore the expected column order before comparing.
               val df2 = reloaded
                 .sort("id")
                 .select("id", "IntegerTypeLabel", "LongTypeLabel",
                   "FloatTypeLabel", "DoubleTypeLabel", "VectorLabel", "name")

               val roundTripped = df2.collect().toSeq
               assert(roundTripped === testRows.toSeq)
             }
           }

The exception thrown should be similar to #141 (comment)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Possible reason that may fail 'Propagate' - 'write data locally' in LocalWriteSuite test #164

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

	/**
	 * Writes one partition as a TFRecord file named `part-NNNNN` (zero-padded partition
	 * index) inside `localPath`, using plain java.io file access.
	 *
	 * NOTE(review): the existence check below is evaluated per partition, against the
	 * directory. When several partitions are written into the same directory on the same
	 * machine, a partition that starts after another one has already created the directory
	 * sees it as pre-existing (throws under ErrorIfExists, writes nothing under Ignore).
	 * Checking once before the partitions are mapped would avoid that; such a check has to
	 * live in the caller, which is outside this method.
	 *
	 * @param index     partition index, used to build the part file name
	 * @param part      records of this partition; only the keys are written
	 * @param localPath directory that receives the part file
	 * @param mode      save mode deciding what happens when `localPath` already exists
	 * @return a single-element iterator holding the number of records written, or an
	 *         empty iterator when the write is skipped under SaveMode.Ignore
	 * @throws IllegalStateException if `localPath` exists and mode is ErrorIfExists
	 */
	private def writePartitionLocal(
	    index: Int,
	    part: Iterator[(BytesWritable, NullWritable)],
	    localPath: String,
	    mode: SaveMode): Iterator[Int] = {
	  val dir = new File(localPath)
	  val alreadyExists = dir.exists()
	  if (alreadyExists && mode == SaveMode.ErrorIfExists) {
	    throw new IllegalStateException(
	      s"LocalPath $localPath already exists. SaveMode: ErrorIfExists.")
	  }
	  if (alreadyExists && mode == SaveMode.Ignore) {
	    Iterator.empty
	  } else {
	    // Make the directory if it does not exist
	    dir.mkdirs()
	    // The path to the partition file.
	    val filePath = f"$localPath/part-$index%05d"
	    // Buffer the stream: the records are written as many small chunks, and the
	    // buffer is flushed when the stream is closed.
	    val fos = new DataOutputStream(
	      new java.io.BufferedOutputStream(new FileOutputStream(filePath)))
	    var count = 0
	    try {
	      val tfw = new TFRecordWriter(fos)
	      for ((bw, _) <- part) {
	        // getBytes exposes the backing array, of which only the first getLength
	        // bytes are valid; trim it when the array is larger than the payload.
	        val backing = bw.getBytes
	        val length = bw.getLength
	        val record =
	          if (backing.length == length) backing
	          else java.util.Arrays.copyOf(backing, length)
	        tfw.write(record)
	        count += 1
	      }
	    } finally {
	      fos.close()
	    }
	    Iterator(count)
	  }
	}

	// Working around the closure variable captures.
	/**
	 * Returns a function of (partition index, partition records) that forwards to
	 * writePartitionLocal with `localPath` and `mode` fixed to the values given here.
	 */
	private def writePartitionLocalFun(
	    localPath: String,
	    mode: SaveMode): (Int, Iterator[(BytesWritable, NullWritable)]) => Iterator[Int] = {
	  (partitionIndex: Int, records: Iterator[(BytesWritable, NullWritable)]) =>
	    writePartitionLocal(partitionIndex, records, localPath, mode)
	}

	"Propagate" should {
	  "write data locally" in {
	    // Two slices, so the dataframe has 2 partitions.
	    val twoSliceRdd = spark.sparkContext.parallelize(testRows, numSlices = 2)
	    val sourceDf = spark.createDataFrame(twoSliceRdd, schema)

	    // The partitions are written to the local hard drive. Everything runs on one
	    // machine here, so both partitions end up in the same directory; in a
	    // distributed setting two different machines would each hold one partition.
	    val tempRoot = Files.createTempDirectory("spark-connector-propagate")
	    val localPath = tempRoot.toAbsolutePath.toString
	    val savePath = localPath + "/testResult"
	    val localWriter = sourceDf.write
	      .format("tfrecords")
	      .option("recordType", "Example")
	      .option("writeLocality", "local")
	    localWriter.save(savePath)

	    // Read the directory back with the Hadoop file readers; the data should match.
	    // This only holds in this test, not in general: the partitions are written on
	    // the workers, and for tests everything runs locally.
	    val reloaded = spark.read
	      .format("tfrecords")
	      .option("recordType", "Example")
	      .load(savePath)
	    // Sort by id and restore the expected column order before comparing.
	    val df2 = reloaded
	      .sort("id")
	      .select("id", "IntegerTypeLabel", "LongTypeLabel",
	        "FloatTypeLabel", "DoubleTypeLabel", "VectorLabel", "name")

	    val roundTripped = df2.collect().toSeq
	    assert(roundTripped === testRows.toSeq)
	  }
	}

Possible reason that may fail 'Propagate' - 'write data locally' in LocalWriteSuite test #164

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions