Saturday, August 20, 2016

How to use sqoop to copy MySQL tables to Hive

sqoop is useful when you need to copy MySQL tables to Hive

Here is an example to copy MySQL tables from different shards to one Hive table:


 #!/bin/bash  
   
 set -x  
 set -e  
   
 game=gameexample  
   
 mysql_host_prefix=userdb  
 mysql_host_suffix=myhostname.com  
 mysql_tab=user  
 mysql_database=mydatabase  
 mysql_user=myusername  
 mysql_pwd=xxxxx  
   
 hive_tab=user  
   
 echo "Log: `date` dropping Hive table $game.${hive_tab}"  
 hive -S -e "drop table \`$game.${hive_tab}\`;"   
   
 for SHARD_ID in {1..200}; do  
   
   mysql_host=${mysql_host_prefix}-${SHARD_ID}-${mysql_host_suffix}  
   mysql_conn="mysql --user=$mysql_user --password=$mysql_pwd -D${mysql_database} --host=${mysql_host} -s --skip-column-names"  
   hive_shard_tab=${hive_tab}_shard${SHARD_ID}  
   hdfs_dir=/user/mapr/${mysql_tab}_shard${SHARD_ID}  
   if ping -c 1 -W 1 "$mysql_host"; then  
     echo "Log: `date` $mysql_host is alive"  
   else  
     echo "Log: `date` $mysql_host is not alive"  
     exit 0  
   fi   
   
   echo "Log: `date` dropping Hive table $game.${hive_shard_tab}"  
   hive -S -e "drop table $game.${hive_shard_tab}"   
   echo "Log: `date` removing ${hdfs_dir} on HDFS"  
   hadoop fs -rm -r -f ${hdfs_dir}  
     
   sql="select count(*) from \`${mysql_tab}\`"  
   mysql_row_cnt=`echo "$sql" | $mysql_conn`  
   echo "Log: `date` found ${mysql_row_cnt} rows in the MySQL table ${mysql_tab} with query: $sql"  
     
   sqoop import \  
    --connect jdbc:mysql://$mysql_host/${mysql_database} \  
    --table "${mysql_tab}" \  
    --username $mysql_user \  
    --password $mysql_pwd \  
    --num-mappers 1 \  
    --hive-overwrite \  
    --hive-table $game.${hive_shard_tab} \  
    --hive-import \  
    --target-dir ${hdfs_dir} \  
    --hive-delims-replacement ' '   
     
   hive_row_cnt=`hive -S -e "select count(*) from $game.${hive_shard_tab}"`  
   echo "Log: `date` ended up with ${hive_row_cnt} rows in the Hive table $game.${hive_shard_tab} which are copied from the MySQL table ${mysql_tab} (${mysql_row_cnt} rows)"  
     
   # merging  
   if [ $SHARD_ID = 1 ]; then  
      sql_str="create table $game.\`$hive_tab\` as select * from $game.${hive_shard_tab};"  
      echo "Log: `date` creating the Hive table $game.${hive_tab} with the data from the first Shard with sql: $sql_str"  
      hive -S -e "$sql_str"   
   else  
      sql_str="insert into table $game.\`$hive_tab\` select * from $game.${hive_shard_tab};"  
      echo "Log: `date` merging into the Hive table $game.${hive_tab} the data from Shard $SHARD_ID with sql: $sql_str"  
      hive -S -e "$sql_str"   
   fi  
 done  
 exit 0  
   

No comments: