@conference {2632, title = {A Comparison of Three Empirical Reliability Estimates for Computerized Adaptive Testing}, booktitle = {IACAT 2017 conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Reliability estimates in Computerized Adaptive Testing (CAT) are derived from estimated thetas and the standard errors of those estimates. In practice, the observed standard error (OSE) of an estimated theta can be obtained from the test information function for each examinee under item response theory (IRT). Unlike classical test theory (CTT), OSEs in IRT are conditional on each estimated theta, so they must be marginalized before test reliability can be characterized. The arithmetic mean, the harmonic mean, and Jensen equality were applied to marginalize the OSEs and estimate CAT reliability. The three resulting empirical CAT reliabilities, each based on a different marginalization method, were compared with true reliability. Results showed that all three empirical CAT reliabilities underestimated true reliability at short test lengths (\< 40), whereas at long test lengths (\> 40) the CAT reliabilities were ordered, from largest to smallest, as Jensen equality, harmonic mean, and arithmetic mean. In particular, Jensen equality overestimated true reliability across all conditions at long test lengths (\> 50).
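As an illustrative sketch that is not part of the original abstract: writing $SE(\hat\theta_j)$ for the conditional standard error of examinee $j = 1, \ldots, N$ and $\sigma^2_{\hat\theta}$ for the variance of the estimated thetas, the arithmetic-mean and harmonic-mean marginalizations of the squared OSEs yield empirical reliabilities of the form

\[
\rho_{AM} = 1 - \frac{\tfrac{1}{N}\sum_{j=1}^{N} SE^{2}(\hat\theta_j)}{\sigma^2_{\hat\theta}},
\qquad
\rho_{HM} = 1 - \frac{N \big/ \sum_{j=1}^{N} SE^{-2}(\hat\theta_j)}{\sigma^2_{\hat\theta}}.
\]

The subscripts AM and HM are labels introduced here for illustration, and the exact form of the Jensen-equality estimator is not reproduced because the abstract does not give it. Because the harmonic mean of the squared standard errors never exceeds their arithmetic mean (a consequence of Jensen's inequality), $\rho_{HM} \ge \rho_{AM}$, which is consistent with the ordering reported above.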

Session Video

}, keywords = {CAT, Reliability}, url = {https://drive.google.com/file/d/1gXgH-epPIWJiE0LxMHGiCAxZZAwy4dAH/view?usp=sharing}, author = {Dong Gi Seo} } @conference {2099, title = {From Reliability to Validity: Expanding Adaptive Testing Practice to Find the Most Valid Score for Each Test Taker}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

CAT is an exception to the traditional conception of validity: it is one of the few examples of individualized testing, with item difficulty tailored to each examinee. The intent, however, has been increased efficiency; the focus is on reliability (reduced standard error), equivalence with paper \& pencil tests is valued, and validity is enhanced through improved reliability.

How Else Might We Individualize Testing Using CAT?

An ISV-Based View of Validity

Test Event -- An examinee encounters a series of items in a particular context.

CAT Goal: individualize testing to address CIV threats to score validity (i.e., maximize ISV).

Some Research Issues:

}, keywords = {CAT, CIV, construct-irrelevant variance, Individual Score Validity, ISV, low test taking motivation, Reliability, validity}, author = {Steven L. Wise} } @article {41, title = {An examination of the reliability and validity of performance ratings made using computerized adaptive rating scales}, journal = {Dissertation Abstracts International: Section B: The Sciences and Engineering}, volume = {61}, number = {1-B}, year = {2000}, pages = {570}, abstract = {This study compared the psychometric properties of performance ratings made using recently-developed computerized adaptive rating scales (CARS) to the psychometric properties of ratings made using more traditional paper-and-pencil rating formats, i.e., behaviorally-anchored and graphic rating scales. Specifically, the reliability, validity, and accuracy of the performance ratings from each format were examined. One hundred twelve participants viewed six 5-minute videotapes of office situations and rated the performance of a target person in each videotape on three contextual performance dimensions (Personal Support, Organizational Support, and Conscientious Initiative) using CARS and either behaviorally-anchored or graphic rating scales. Performance rating properties were measured using Shrout and Fleiss{\textquoteright}s intraclass correlation (2,1), Borman{\textquoteright}s differential accuracy measure, and Cronbach{\textquoteright}s accuracy components as indexes of rating reliability, validity, and accuracy, respectively. Results found that performance ratings made using the CARS were significantly more reliable and valid than performance ratings made using either of the other formats. Additionally, CARS yielded more accurate performance ratings than the paper-and-pencil formats. The nature of the CARS system (i.e., its adaptive nature and scaling methodology) and its paired-comparison judgment task are offered as possible reasons for the differences found in the psychometric properties of the performance ratings made using the various rating formats. (PsycINFO Database Record (c) 2005 APA)}, keywords = {Adaptive Testing, Computer Assisted Testing, Performance Tests, Rating Scales, Reliability, Test, Test Validity}, author = {Buck, D. E.} }