1 /* 2 * Copyright (C) 2019 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.tradefed.device.metric; 17 18 import static com.android.tradefed.util.proto.TfMetricProtoUtil.stringToMetric; 19 20 import com.android.os.AtomsProto.Atom; 21 import com.android.os.StatsLog.EventMetricData; 22 import com.android.os.StatsLog.StatsdStatsReport; 23 import com.android.tradefed.config.OptionClass; 24 import com.android.tradefed.device.DeviceNotAvailableException; 25 import com.android.tradefed.device.ITestDevice; 26 import com.android.tradefed.log.LogUtil.CLog; 27 import com.android.tradefed.metrics.proto.MetricMeasurement.Metric; 28 import com.android.tradefed.util.statsd.ConfigUtil; 29 import com.android.tradefed.util.statsd.MetricUtil; 30 31 import com.google.common.annotations.VisibleForTesting; 32 import com.google.common.collect.Iterables; 33 import com.google.protobuf.InvalidProtocolBufferException; 34 35 import java.io.IOException; 36 import java.text.SimpleDateFormat; 37 import java.util.Arrays; 38 import java.util.ArrayList; 39 import java.util.Date; 40 import java.util.HashMap; 41 import java.util.List; 42 import java.util.Map; 43 import java.util.NoSuchElementException; 44 import java.util.concurrent.TimeUnit; 45 import java.util.stream.Collectors; 46 47 /** 48 * Collector that collects timestamps of runtime restarts (system server crashes) during the test 49 * run, if any. 50 * 51 * <p>Outputs results in counts, wall clock time in seconds and in HH:mm:ss format, and system 52 * uptime in nanoseconds and HH:mm:ss format. 53 * 54 * <p>This collector uses two sources for system server crashes: 55 * 56 * <p> 57 * 58 * <ol> 59 * <li>The system_restart_sec list from StatsdStatsReport, which is a rolling list of 20 60 * timestamps when the system server crashes, in seconds, with newer crashes appended to the 61 * end (when the list fills up, older timestamps fall off the beginning). 62 * <li>The AppCrashOccurred statsd atom, where a system server crash shows up as a system_server 63 * process crash (this behavior is documented in the statsd atoms.proto definition). The event 64 * metric gives the device uptime when the crash occurs. 65 * </ol> 66 * 67 * <p>Both can be useful information, as the former makes it easy to correlate timestamps in logs, 68 * while the latter serves as a longevity metric. 69 */ 70 @OptionClass(alias = "runtime-restart-collector") 71 public class RuntimeRestartCollector extends BaseDeviceMetricCollector { 72 private static final String METRIC_SEP = "-"; 73 public static final String METRIC_PREFIX = "runtime-restart"; 74 public static final String METRIC_SUFFIX_COUNT = "count"; 75 public static final String METRIC_SUFFIX_SYSTEM_TIMESTAMP_SECS = "timestamps_secs"; 76 public static final String METRIC_SUFFIX_SYSTEM_TIMESTAMP_FORMATTED = "timestamps_str"; 77 public static final String METRIC_SUFFIX_UPTIME_NANOS = "uptime_nanos"; 78 public static final String METRIC_SUFFIX_UPTIME_FORMATTED = "uptime_str"; 79 public static final SimpleDateFormat TIME_FORMATTER = new SimpleDateFormat("HH:mm:ss"); 80 public static final String SYSTEM_SERVER_KEYWORD = "system_server"; 81 82 private List<ITestDevice> mTestDevices; 83 // Map to store the runtime restart timestamps for each device, keyed by device serial numbers. 84 private Map<String, List<Integer>> mExistingTimestamps = new HashMap<>(); 85 // Map to store the statsd config IDs for each device, keyed by device serial numbers. 86 private Map<String, Long> mDeviceConfigIds = new HashMap<>(); 87 // Flag variable to indicate whether including the device serial in the metrics is needed. 88 // Device serial is included when there are more than one device under test. 89 private boolean mIncludeDeviceSerial = false; 90 91 /** 92 * Store the existing timestamps of system server restarts prior to the test run as statsd keeps 93 * a running log of them, and push the config to collect app crashes. 94 */ 95 @Override onTestRunStart(DeviceMetricData runData)96 public void onTestRunStart(DeviceMetricData runData) { 97 mTestDevices = getDevices(); 98 mIncludeDeviceSerial = (mTestDevices.size() > 1); 99 for (ITestDevice device : mTestDevices) { 100 // Pull statsd metadata (StatsdStatsReport) from the device and keep track of existing 101 // runtime restart timestamps. 102 try { 103 List<Integer> existingTimestamps = 104 getStatsdMetadata(device).getSystemRestartSecList(); 105 mExistingTimestamps.put(device.getSerialNumber(), existingTimestamps); 106 } catch (DeviceNotAvailableException | InvalidProtocolBufferException e) { 107 // Error is not thrown as we still want to process the other devices. 108 CLog.e( 109 "Failed to get statsd metadata from device %s. Exception: %s.", 110 device.getSerialNumber(), e); 111 } 112 113 // Register statsd config to collect app crashes. 114 try { 115 mDeviceConfigIds.put( 116 device.getSerialNumber(), 117 pushStatsConfig( 118 device, Arrays.asList(Atom.APP_CRASH_OCCURRED_FIELD_NUMBER))); 119 } catch (DeviceNotAvailableException | IOException e) { 120 // Error is not thrown as we still want to push the config to other devices. 121 CLog.e( 122 "Failed to push statsd config to device %s. Exception: %s.", 123 device.getSerialNumber(), e); 124 } 125 } 126 } 127 128 /** 129 * Pull the timestamps at the end of the test run and report the difference with existing ones, 130 * if any. 131 */ 132 @Override onTestRunEnd( DeviceMetricData runData, final Map<String, Metric> currentRunMetrics)133 public void onTestRunEnd( 134 DeviceMetricData runData, final Map<String, Metric> currentRunMetrics) { 135 for (ITestDevice device : mTestDevices) { 136 // Pull statsd metadata again to look at the changes of system server crash timestamps 137 // during the test run, and report them. 138 // A copy of the list is created as the list returned by the proto is not modifiable. 139 List<Integer> updatedTimestamps = new ArrayList<>(); 140 try { 141 updatedTimestamps.addAll(getStatsdMetadata(device).getSystemRestartSecList()); 142 } catch (DeviceNotAvailableException | InvalidProtocolBufferException e) { 143 // Error is not thrown as we still want to process the other devices. 144 CLog.e( 145 "Failed to get statsd metadata from device %s. Exception: %s.", 146 device.getSerialNumber(), e); 147 continue; 148 } 149 if (!mExistingTimestamps.containsKey(device.getSerialNumber())) { 150 CLog.e("No prior state recorded for device %s.", device.getSerialNumber()); 151 addStatsdStatsBasedMetrics( 152 currentRunMetrics, updatedTimestamps, device.getSerialNumber()); 153 } else { 154 try { 155 int lastTimestampBeforeTestRun = 156 Iterables.getLast(mExistingTimestamps.get(device.getSerialNumber())); 157 // If the last timestamp is not found, lastIndexOf(...) + 1 returns 0 which is 158 // what is needed in that situation. 159 addStatsdStatsBasedMetrics( 160 currentRunMetrics, 161 getAllValuesAfter(lastTimestampBeforeTestRun, updatedTimestamps), 162 device.getSerialNumber()); 163 } catch (NoSuchElementException e) { 164 // The exception occurs when no prior runtime restarts had been recorded. It is 165 // thrown from Iterables.getLast(). 166 addStatsdStatsBasedMetrics( 167 currentRunMetrics, updatedTimestamps, device.getSerialNumber()); 168 } 169 } 170 } 171 172 // Pull the AppCrashOccurred event metric reports and report the system server crashes 173 // within. 174 for (ITestDevice device : mTestDevices) { 175 if (!mDeviceConfigIds.containsKey(device.getSerialNumber())) { 176 CLog.e("No config ID is associated with device %s.", device.getSerialNumber()); 177 continue; 178 } 179 long configId = mDeviceConfigIds.get(device.getSerialNumber()); 180 List<Long> uptimeListNanos = new ArrayList<>(); 181 try { 182 List<EventMetricData> metricData = getEventMetricData(device, configId); 183 uptimeListNanos.addAll( 184 metricData 185 .stream() 186 .filter(d -> d.hasElapsedTimestampNanos()) 187 .filter(d -> d.hasAtom()) 188 .filter(d -> d.getAtom().hasAppCrashOccurred()) 189 .filter(d -> d.getAtom().getAppCrashOccurred().hasProcessName()) 190 .filter( 191 d -> 192 SYSTEM_SERVER_KEYWORD.equals( 193 d.getAtom() 194 .getAppCrashOccurred() 195 .getProcessName())) 196 .map(d -> d.getElapsedTimestampNanos()) 197 .collect(Collectors.toList())); 198 } catch (DeviceNotAvailableException e) { 199 // Error is not thrown as we still want to process other devices. 200 CLog.e( 201 "Failed to retrieve event metric data from device %s. Exception: %s.", 202 device.getSerialNumber(), e); 203 } 204 addAtomBasedMetrics(currentRunMetrics, uptimeListNanos, device.getSerialNumber()); 205 try { 206 removeConfig(device, configId); 207 } catch (DeviceNotAvailableException e) { 208 // Error is not thrown as we still want to remove the config from other devices. 209 CLog.e( 210 "Failed to remove statsd config from device %s. Exception: %s.", 211 device.getSerialNumber(), e); 212 } 213 } 214 } 215 216 /** Helper method to add metrics from StatsdStatsReport according to timestamps. */ addStatsdStatsBasedMetrics( final Map<String, Metric> metrics, List<Integer> timestampsSecs, String serial)217 private void addStatsdStatsBasedMetrics( 218 final Map<String, Metric> metrics, List<Integer> timestampsSecs, String serial) { 219 // Always add a count of system server crashes, regardless of whether there are any. 220 // The statsd metadata-based count is used as the atom-based data can miss runtime restart 221 // instances. 222 // TODO(b/135770315): Re-assess this after the root cause for missing instances in the atom 223 // -based results is found. 224 String countMetricKey = createMetricKey(METRIC_SUFFIX_COUNT, serial); 225 metrics.put(countMetricKey, stringToMetric(String.valueOf(timestampsSecs.size()))); 226 // If there are runtime restarts, add a comma-separated list of timestamps. 227 if (!timestampsSecs.isEmpty()) { 228 // Store both the raw timestamp and the formatted, more readable version. 229 230 String timestampMetricKey = 231 createMetricKey(METRIC_SUFFIX_SYSTEM_TIMESTAMP_SECS, serial); 232 metrics.put( 233 timestampMetricKey, 234 stringToMetric( 235 timestampsSecs 236 .stream() 237 .map(t -> String.valueOf(t)) 238 .collect(Collectors.joining(",")))); 239 240 String formattedTimestampMetricKey = 241 createMetricKey(METRIC_SUFFIX_SYSTEM_TIMESTAMP_FORMATTED, serial); 242 metrics.put( 243 formattedTimestampMetricKey, 244 stringToMetric( 245 timestampsSecs 246 .stream() 247 .map(t -> timestampToHoursMinutesSeconds(t)) 248 .collect(Collectors.joining(",")))); 249 } 250 } 251 252 /** Helper method to add metrics from the AppCrashOccurred atoms according to timestamps. */ addAtomBasedMetrics( final Map<String, Metric> metrics, List<Long> timestampsNanos, String serial)253 private void addAtomBasedMetrics( 254 final Map<String, Metric> metrics, List<Long> timestampsNanos, String serial) { 255 // If there are runtime restarts, add a comma-separated list of device uptime timestamps. 256 if (!timestampsNanos.isEmpty()) { 257 // Store both the raw timestamp and the formatted, more readable version. 258 259 String uptimeNanosMetricKey = createMetricKey(METRIC_SUFFIX_UPTIME_NANOS, serial); 260 metrics.put( 261 uptimeNanosMetricKey, 262 stringToMetric( 263 timestampsNanos 264 .stream() 265 .map(t -> String.valueOf(t)) 266 .collect(Collectors.joining(",")))); 267 268 String formattedUptimeMetricKey = 269 createMetricKey(METRIC_SUFFIX_UPTIME_FORMATTED, serial); 270 metrics.put( 271 formattedUptimeMetricKey, 272 stringToMetric( 273 timestampsNanos 274 .stream() 275 .map(t -> nanosToHoursMinutesSeconds(t)) 276 .collect(Collectors.joining(",")))); 277 } 278 } 279 createMetricKey(String suffix, String serial)280 private String createMetricKey(String suffix, String serial) { 281 return mIncludeDeviceSerial 282 ? String.join(METRIC_SEP, METRIC_PREFIX, serial, suffix) 283 : String.join(METRIC_SEP, METRIC_PREFIX, suffix); 284 } 285 286 /** 287 * Forwarding logic to {@link ConfigUtil} for testing as it is a static utility class. 288 * 289 * @hide 290 */ 291 @VisibleForTesting pushStatsConfig(ITestDevice device, List<Integer> eventAtomIds)292 protected long pushStatsConfig(ITestDevice device, List<Integer> eventAtomIds) 293 throws IOException, DeviceNotAvailableException { 294 return ConfigUtil.pushStatsConfig(device, eventAtomIds); 295 } 296 297 /** 298 * Forwarding logic to {@link ConfigUtil} for testing as it is a static utility class. 299 * 300 * @hide 301 */ 302 @VisibleForTesting removeConfig(ITestDevice device, long configId)303 protected void removeConfig(ITestDevice device, long configId) 304 throws DeviceNotAvailableException { 305 ConfigUtil.removeConfig(device, configId); 306 } 307 308 /** 309 * Forwarding logic to {@link MetricUtil} for testing as it is a static utility class. 310 * 311 * @hide 312 */ 313 @VisibleForTesting getStatsdMetadata(ITestDevice device)314 protected StatsdStatsReport getStatsdMetadata(ITestDevice device) 315 throws DeviceNotAvailableException, InvalidProtocolBufferException { 316 return MetricUtil.getStatsdMetadata(device); 317 } 318 319 /** 320 * Forwarding logic to {@link MetricUtil} for testing as it is a static utility class. 321 * 322 * @hide 323 */ 324 @VisibleForTesting getEventMetricData(ITestDevice device, long configId)325 protected List<EventMetricData> getEventMetricData(ITestDevice device, long configId) 326 throws DeviceNotAvailableException { 327 return MetricUtil.getEventMetricData(device, configId); 328 } 329 330 /** 331 * Helper method to convert nanoseconds to HH:mm:ss. {@link SimpleDateFormat} is not used here 332 * as the input is not a real epoch-based timestamp. 333 */ nanosToHoursMinutesSeconds(long nanos)334 private String nanosToHoursMinutesSeconds(long nanos) { 335 long hours = TimeUnit.NANOSECONDS.toHours(nanos); 336 nanos -= TimeUnit.HOURS.toNanos(hours); 337 long minutes = TimeUnit.NANOSECONDS.toMinutes(nanos); 338 nanos -= TimeUnit.MINUTES.toNanos(minutes); 339 long seconds = TimeUnit.NANOSECONDS.toSeconds(nanos); 340 return String.format("%02d:%02d:%02d", hours, minutes, seconds); 341 } 342 343 /** Helper method to format an epoch-based timestamp in seconds to HH:mm:ss. */ timestampToHoursMinutesSeconds(int seconds)344 private String timestampToHoursMinutesSeconds(int seconds) { 345 return TIME_FORMATTER.format(new Date(TimeUnit.SECONDS.toMillis(seconds))); 346 } 347 348 /** Helper method to get all values in a list that appear after all occurrences of target. */ getAllValuesAfter(int target, List<Integer> l)349 private List<Integer> getAllValuesAfter(int target, List<Integer> l) { 350 return l.subList(l.lastIndexOf(target) + 1, l.size()); 351 } 352 } 353