I am trying to get the first letter of all the values in column ‘word’ but getting error.
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import spark.implicits._
// Define case classes for input data
case class Docword(docId: Int, vocabId: Int, count: Int)
case class VocabWord(vocabId: Int, word: String)
// Read the input data
val docwords = spark.read.
schema(Encoders.product[Docword].schema).
option("delimiter", " ").
csv("hdfs:///user/ashhall1616/bdc_data/t3/docword.txt").
as[Docword]
val vocab = spark.read.
schema(Encoders.product[VocabWord].schema).
option("delimiter", " ").
csv("hdfs:///user/ashhall1616/bdc_data/t3/vocab.txt").
as[VocabWord]
def firstletter(x: String): String = {
x.substring(0,1)}
val firstletterUdf =spark.udf.regster[String,String]("firstletter", firstletter(_))
val joinfile = docwords.join(vocab, "vocabId").select($"word", $"docId", $"count").withColumn("firstletter", firstletterUdf($"word"))
joinfile.write.mode("overwrite").partitionBy("firstletter").parquet("file:///home/user204943816622/t3_docword_index_part.parquet")
joinfile.show(10)
ERROR:
val firstletterUdf =spark.udf.regster[String,String]("firstletter", firstletter(_))
<console>:100: error: value regster is not a member of org.apache.spark.sql.UDFRegistration
val firstletterUdf =spark.udf.regster[String,String]("firstletter", firstletter(_))
^
scala> val joinfile = docwords.join(vocab, "vocabId").select($"word", $"docId", $"count").withColumn("firstletter", firstletterUdf($"word"))
<console>:106: error: not found: value firstletterUdf
val joinfile = docwords.join(vocab, "vocabId").select($"word", $"docId", $"count").withColumn("firstletter", firstletterUdf($"word"))
Want to get output as :
|word|docId|count|firstLetter
plane| 1| 1000| p
Please suggest.