
Hadoop Learning Path (6), by zs浪里小白龙


1. Hive

conf/hive-log4j.properties
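(The body of hive-log4j.properties did not survive the scrape; only its opening ASF license header remained before the text jumps into the first Flume config. Below is a minimal sketch of the one setting a walkthrough like this typically changes; the directory is inferred from the Flume exec source further down, which tails /opt/module/hive/logs/hive.log.)

# hive-log4j.properties (sketch) -- only hive.log.dir is grounded in the rest of this post
hive.log.dir=/opt/module/hive/logs
hive.log.file=hive.log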

2. Flume

The first agent config (its original filename was lost in extraction): an exec source tails the Hive log, and an HDFS sink writes the events out.

#(the declarations at the head of this file were lost in extraction; by analogy with the
# configs below they would be a1.sources = r1 / a1.sinks = k1 / a1.channels = c1,
# with a1.sources.r1.type=exec)
a1.sources.r1.command=tail -f /opt/module/hive/logs/hive.log
#define the channel
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
#define the sink
a1.sinks.k1.type = hdfs
#once the path contains time-based escape sequences, the event header must carry a timestamp; if it does not, set useLocalTimeStamp = true
a1.sinks.k1.hdfs.path = hdfs://hadoop101:9000/flume/%Y%m%d/%H/%M
#prefix for uploaded files
a1.sinks.k1.hdfs.filePrefix = logs-
#the next three settings control directory rolling; once the path has a time escape sequence, directories roll by timestamp
#whether to round the timestamp down
a1.sinks.k1.hdfs.round = true
#how many time units per new directory
a1.sinks.k1.hdfs.roundValue = 1
#the time unit used for rounding
a1.sinks.k1.hdfs.roundUnit = minute
#whether to use the local timestamp
a1.sinks.k1.hdfs.useLocalTimeStamp = true
#how many events to accumulate before one flush to HDFS
a1.sinks.k1.hdfs.batchSize = 100
#the next three settings control file rolling; they are OR-ed together, and a value of 0 disables that trigger
#roll to a new file every 10 seconds
a1.sinks.k1.hdfs.rollInterval = 10
#roll when a file reaches about 128 MB
a1.sinks.k1.hdfs.rollSize = 134217700
#roll after this many events (0 = disabled)
a1.sinks.k1.hdfs.rollCount = 0
#store the data as uncompressed text
a1.sinks.k1.hdfs.fileType=DataStream
#wire the components: one source can feed multiple channels, but a sink reads from exactly one channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
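A sketch of how this agent would be launched (execsource-hdfssink.conf and the jobs/ directory are placeholder names for the lost original; the HDFS sink also needs the Hadoop client jars visible on Flume's classpath):

flume-ng agent --conf conf --conf-file jobs/execsource-hdfssink.conf --name a1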

taildirsource-loggersink.conf

#a1 is the agent's name; it defines one source named r1 (separate multiple names with spaces)
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#<group name>.<property name>=<property value>
a1.sources.r1.type=TAILDIR
a1.sources.r1.filegroups=f1 f2
a1.sources.r1.filegroups.f1=/home/layman/hi
a1.sources.r1.filegroups.f2=/home/layman/test
#define the sink
a1.sinks.k1.type=logger
a1.sinks.k1.maxBytesToLog=100
#define the channel
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
#wire the components: one source can feed multiple channels, but a sink reads from exactly one channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
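To exercise the TAILDIR source, start the agent and append to the watched files from another shell; TAILDIR records its read offsets in a JSON position file (~/.flume/taildir_position.json by default), so it resumes where it left off after a restart. A usage sketch (the jobs/ path is a placeholder):

flume-ng agent --conf conf --conf-file jobs/taildirsource-loggersink.conf --name a1 -Dflume.root.logger=INFO,console

# in another shell: each appended line arrives as one logger event
echo "hello taildir" >> /home/layman/hi
echo "hello again"   >> /home/layman/test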

spoolingdirsource-hdfsink.conf

#a1 is the agent's name; it defines one source named r1 (separate multiple names with spaces)
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#<group name>.<property name>=<property value>
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/home/layman/flume
#define the channel
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
#define the sink
a1.sinks.k1.type = hdfs
#once the path contains time-based escape sequences, the event header must carry a timestamp; if it does not, set useLocalTimeStamp = true
a1.sinks.k1.hdfs.path = hdfs://hadoop101:9000/flume/%Y%m%d/%H/%M
#prefix for uploaded files
a1.sinks.k1.hdfs.filePrefix = logs-
#the next three settings control directory rolling; once the path has a time escape sequence, directories roll by timestamp
#whether to round the timestamp down
a1.sinks.k1.hdfs.round = true
#how many time units per new directory
a1.sinks.k1.hdfs.roundValue = 1
#the time unit used for rounding
a1.sinks.k1.hdfs.roundUnit = minute
#whether to use the local timestamp
a1.sinks.k1.hdfs.useLocalTimeStamp = true
#how many events to accumulate before one flush to HDFS
a1.sinks.k1.hdfs.batchSize = 100
#the next three settings control file rolling; they are OR-ed together, and a value of 0 disables that trigger
#roll to a new file every 30 seconds
a1.sinks.k1.hdfs.rollInterval = 30
#roll when a file reaches about 128 MB
a1.sinks.k1.hdfs.rollSize = 134217700
#roll after this many events (0 = disabled)
a1.sinks.k1.hdfs.rollCount = 0
#store the data as uncompressed text
a1.sinks.k1.hdfs.fileType=DataStream
#wire the components: one source can feed multiple channels, but a sink reads from exactly one channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
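The spooldir source ingests whole files dropped into the watched directory and renames each to *.COMPLETED once delivered; files must be complete before they are placed there, and re-delivering a file under a name already seen makes the source fail. A usage sketch (the jobs/ path is a placeholder):

flume-ng agent --conf conf --conf-file jobs/spoolingdirsource-hdfsink.conf --name a1

# drop a finished file into the spool directory...
cp /tmp/app.log /home/layman/flume/
# ...it is ingested into HDFS, then renamed
ls /home/layman/flume        # app.log.COMPLETED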

avrosource-loggersink.conf

#agent2
#a1 is the agent's name; it defines one source named r1 (separate multiple names with spaces)
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#<group name>.<property name>=<property value>
a1.sources.r1.type=avro
a1.sources.r1.bind=hadoop102
a1.sources.r1.port=33333
#define the sink
a1.sinks.k1.type=logger
#define the channel
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
#wire the components: one source can feed multiple channels, but a sink reads from exactly one channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1

netcatsource-avrosink.conf

#agent1
#a1 is the agent's name; it defines one source named r1 (separate multiple names with spaces)
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#<group name>.<property name>=<property value>
a1.sources.r1.type=netcat
a1.sources.r1.bind=hadoop101
a1.sources.r1.port=44444
#define the sink
a1.sinks.k1.type=avro
a1.sinks.k1.hostname=hadoop102
a1.sinks.k1.port=33333
#define the channel
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
#wire the components: one source can feed multiple channels, but a sink reads from exactly one channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1

Note the startup order of the two agents above: start agent2 (the avro source listening on hadoop102:33333) before agent1, because agent1's avro sink is an RPC client that tries to connect to that port as soon as it starts. A startup sketch follows.
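A sketch of the startup sequence under these configs (assuming Flume is installed on both machines and the .conf files sit in a placeholder jobs/ directory):

# on hadoop102 -- start agent2 (avro source -> logger) first, so the port is listening
flume-ng agent --conf conf --conf-file jobs/avrosource-loggersink.conf --name a1 -Dflume.root.logger=INFO,console

# on hadoop101 -- then start agent1 (netcat source -> avro sink)
flume-ng agent --conf conf --conf-file jobs/netcatsource-avrosink.conf --name a1

# test: lines typed here should appear in agent2's console output
nc hadoop101 44444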

3. Kafka

conf/server.properties
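(The body of server.properties did not survive the scrape; only its ASF license header remained before the text jumps into hbase-env.sh. For orientation, here is a minimal sketch of the settings a walkthrough like this typically edits; every value is illustrative, and hadoop103 in particular is an assumption about the cluster.)

# unique id of this broker within the cluster
broker.id=0
# where the broker stores its log segments (message data)
log.dirs=/opt/module/kafka/datas
# the external ZooKeeper ensemble
zookeeper.connect=hadoop101:2181,hadoop102:2181,hadoop103:2181
# allow topics to actually be deleted via kafka-topics.sh --delete
delete.topic.enable=true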

4. HBase

conf/hbase-env.sh

(The first half of hbase-env.sh was also lost in extraction; the surviving text picks up mid-way through the JMX section, just after the comment pointing at http://java.sun.com/javase/6/docs/technotes/guides/management/agent.html. The one substantive change in this file is HBASE_MANAGES_ZK=false near the end.)

# NOTE: HBase provides an alternative JMX implementation to fix the random ports issue, please see JMX
# section in HBase Reference Guide for instructions.
# export HBASE_JMX_BASE="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false"
# export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10101"
# export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10102"
# export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10103"
# export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10104"
# export HBASE_REST_OPTS="$HBASE_REST_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10105"

# File naming hosts on which HRegionServers will run. $HBASE_HOME/conf/regionservers by default.
# export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers

# Uncomment and adjust to keep all the Region Server pages mapped to be memory resident
#HBASE_REGIONSERVER_MLOCK=true
#HBASE_REGIONSERVER_UID="hbase"

# File naming hosts on which backup HMaster will run. $HBASE_HOME/conf/backup-masters by default.
# export HBASE_BACKUP_MASTERS=${HBASE_HOME}/conf/backup-masters

# Extra ssh options. Empty by default.
# export HBASE_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HBASE_CONF_DIR"

# Where log files are stored. $HBASE_HOME/logs by default.
# export HBASE_LOG_DIR=${HBASE_HOME}/logs

# Enable remote JDWP debugging of major HBase processes. Meant for Core Developers
# export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8070"
# export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8071"
# export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8072"
# export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8073"

# A string representing this instance of hbase. $USER by default.
# export HBASE_IDENT_STRING=$USER

# The scheduling priority for daemon processes. See 'man nice'.
# export HBASE_NICENESS=10

# The directory where pid files are stored. /tmp by default.
# export HBASE_PID_DIR=/var/hadoop/pids

# Seconds to sleep between slave commands. Unset by default. This
# can be useful in large clusters, where, e.g., slave rsyncs can
# otherwise arrive faster than the master can service them.
# export HBASE_SLAVE_SLEEP=0.1

# Tell HBase whether it should manage it's own instance of Zookeeper or not.
export HBASE_MANAGES_ZK=false

# The default log rolling policy is RFA, where the log file is rolled as per the size defined for the
# RFA appender. Please refer to the log4j.properties file to see more details on this appender.
# In case one needs to do log rolling on a date change, one should set the environment property
# HBASE_ROOT_LOGGER to "<DESIRED_LOG LEVEL>,DRFA".
# For example:
# HBASE_ROOT_LOGGER=INFO,DRFA
# The reason for changing default to RFA is to avoid the boundary case of filling out disk space as
# DRFA doesn't put any cap on the log size. Please refer to HBase-5655 for more context.
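With HBASE_MANAGES_ZK=false, HBase will not spawn its own ZooKeeper and must be pointed at the external ensemble in hbase-site.xml. A minimal sketch; hadoop103 and the /hbase root path are assumptions consistent with the hosts used elsewhere in this post:

<!-- hbase-site.xml (sketch) -->
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://hadoop101:9000/hbase</value>
</property>
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.zookeeper.quorum</name>
  <value>hadoop101,hadoop102,hadoop103</value>
</property>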

5. Phoenix

Mind that Phoenix inherits HBase's configuration; secondary indexing is configured on the HBase side; and when specifying the ZooKeeper quorum, the port number is omitted.
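Concretely, "secondary indexing on the HBase side" means adding Phoenix's indexing WAL codec to hbase-site.xml on every RegionServer (this property comes from the Phoenix documentation), and "port omitted" shows up when connecting with sqlline, where only the quorum hosts are listed; host names here follow the cluster naming assumed above:

<!-- hbase-site.xml on each RegionServer: required for Phoenix secondary indexes -->
<property>
  <name>hbase.regionserver.wal.codec</name>
  <value>org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec</value>
</property>

# connect to Phoenix: quorum hosts only, no :2181 suffix
sqlline.py hadoop101,hadoop102,hadoop103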

6. Sqoop

Configure the Hadoop-related environment variables for Sqoop. Since /etc/profile already exports them globally, they do not need to be specified again here.
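For reference, these are the variables conf/sqoop-env.sh would otherwise export; a sketch with illustrative /opt/module paths, left commented out because /etc/profile already provides them:

# conf/sqoop-env.sh (sketch) -- unnecessary here since /etc/profile exports these
# export HADOOP_COMMON_HOME=/opt/module/hadoop
# export HADOOP_MAPRED_HOME=/opt/module/hadoop
# export HIVE_HOME=/opt/module/hive
# export HBASE_HOME=/opt/module/hbase
# export ZOOKEEPER_HOME=/opt/module/zookeeper
# export ZOOCFGDIR=/opt/module/zookeeper/conf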

(Screenshot placeholder: the original post showed the result of starting Hadoop, ZooKeeper, Hive, HBase, Phoenix, and Sqoop together; the image did not survive the scrape.)
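A sketch of one workable startup order for the whole stack, using each component's standard scripts (assumed to be on PATH; run the ZooKeeper command on every ensemble node):

zkServer.sh start                                            # ZooKeeper first
start-dfs.sh                                                 # then HDFS
start-yarn.sh                                                # and YARN
kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
start-hbase.sh                                               # HBase uses the external ZK
sqlline.py hadoop101,hadoop102,hadoop103                     # Phoenix shell on top of HBase
# verify with jps on each node; Sqoop needs no daemon, it runs per command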


